build: set version to 0.5
[vis.git] / lua / lexers / awk.lua
blob87e39d905b13ebed49e49795b501e015e0f79db3
1 -- Copyright 2006-2017 Mitchell mitchell.att.foicica.com. See LICENSE.
2 -- AWK LPeg lexer.
3 -- Modified by Wolfgang Seeberg 2012, 2013.
5 local l = require('lexer')
6 local token, word_match = l.token, l.word_match
7 local P, R, S = lpeg.P, lpeg.R, lpeg.S
-- Lexer module table; the rule, style and fold tables are attached to it
-- further down and it is returned at the end of the file.
local M = {_NAME = 'awk'}

-- Single characters the hand-written scanners compare against.
local LEFTBRACKET = '['
local RIGHTBRACKET = ']'
local SLASH = '/'
local BACKSLASH = '\\'
local CARET = '^'
local CR = '\r'
local LF = '\n'
local CRLF = CR .. LF
local DQUOTE = '"'

-- Closing delimiter for each opening delimiter handled by
-- scanFieldDelimiters().
local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
-- For each opening delimiter, the other kind of opening delimiter whose
-- nesting is tracked alongside it.
local COMPANION = {['('] = '[', ['['] = '('}

-- Names accepted between '[:' and ':]' by eatCharacterClass().
local CC = {
  alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1,
  print = 1, punct = 1, space = 1, upper = 1, xdigit = 1
}

-- Scanner state shared between the match-time functions.
-- Index of the closing '/' of the regex most recently accepted by eatRegex().
local LastRegexEnd = 0
-- Index of the backslash that ended the most recently scanned comment that
-- ended with one.
local BackslashAtCommentEnd = 0

-- Lowercase keywords after which a '/' is taken to start a regex.
local KW_BEFORE_RX = {
  case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1,
  ['return'] = 1
}
--- Tells whether the run of lowercase letters ending at `e` is one of the
-- keywords in KW_BEFORE_RX.
-- @param input  text being lexed
-- @param e      index of the last character of the candidate word
-- @return true when the word is listed in KW_BEFORE_RX and is not merely the
--   tail of a longer identifier; false otherwise
local function findKeyword(input, e)
  -- Walk left until the character just before the lowercase run (0 when the
  -- run starts the input).
  local start = e
  while start > 0 and input:find("^[%l]", start) do
    start = start - 1
  end
  -- An uppercase letter, digit or underscore right before the run means the
  -- run is only the end of a longer name such as `my_print`.
  if start ~= 0 and input:find("^[%u%d_]", start) then
    return false
  end
  local word = input:sub(start + 1, e)
  return KW_BEFORE_RX[word] == 1
end
--- Decides whether a '/' can start a regex, judging by the text before it.
-- @param input  text being lexed
-- @param i      index of the character immediately preceding the slash
-- @return true when the slash should be treated as a regex delimiter,
--   false when it should not
local function isRegex(input, i)
  while true do
    -- Skip blanks and tabs to the left.
    while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
    -- Nothing but blanks before the slash.
    if i < 1 then return true end

    local ch = input:sub(i, i)
    -- After one of these operators/punctuation characters, or after one of
    -- the keywords in KW_BEFORE_RX, a regex follows.
    if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
      return true
    end
    if ch == SLASH then
      return i ~= LastRegexEnd -- deals with /xx/ / /yy/.
    end
    -- After ']', a word character, ')', '.' or '"' the slash is not a regex.
    if input:find('^[]%w)."]', i) then return false end
    -- Any remaining character other than a line terminator: not a regex.
    if ch ~= LF and ch ~= CR then return false end

    -- We are on a line terminator; step over it (LF, CR LF or CR).
    if i == 1 then return true end
    i = i - 1
    if ch == LF and input:sub(i, i) == CR then
      if i == 1 then return true end
      i = i - 1
    end

    -- A previous line that does not end with a continuation backslash (or
    -- whose backslash is the one recorded at the end of a comment) ends the
    -- statement, so the slash starts a regex.
    if input:sub(i, i) ~= BACKSLASH or i == BackslashAtCommentEnd then
      return true
    end
    -- Continued line: repeat the decision from before the backslash.
    i = i - 1
  end
end
--- Scans the name of a character class after '[:' up to its closing ':]'.
-- @param input  text being lexed
-- @param s      index of the first character after '[:'
-- @param e      last index to examine
-- @return index of the ']' of the closing ':]' when the enclosed name is
--   listed in CC; false when the name is not listed, a line terminator is
--   met first, or no ':]' is found up to `e`
local function eatCharacterClass(input, s, e)
  for pos = s, e do
    if input:find('^[\r\n]', pos) then return false end
    if input:sub(pos, pos + 1) == ':]' then
      local name = input:sub(s, pos - 1)
      if CC[name] == 1 then return pos + 1 end
      return false
    end
  end
  return false
end
--- Scans a bracket expression inside a regex.
-- @param input  text being lexed
-- @param i      index of the first character after the opening '['
-- @param e      last index to examine
-- @return index of the closing ']', or false when a line terminator is met,
--   a '[:' class is rejected by eatCharacterClass(), or `e` is passed
--   without finding the closing bracket
local function eatBrackets(input, i, e)
  -- A leading '^' and a ']' in first position are ordinary members.
  if input:sub(i, i) == CARET then i = i + 1 end
  if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end
  while i <= e do
    local c = input:sub(i, i)
    if input:find('^[\r\n]', i) then
      return false
    elseif c == RIGHTBRACKET then
      return i
    elseif input:sub(i, i + 1) == '[:' then
      -- Continue after the ']' that closes the character class.
      i = eatCharacterClass(input, i + 2, e)
      if not i then return false end
    elseif c == BACKSLASH then
      -- Skip the escaped character; an escaped CRLF counts as one.
      i = i + 1
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end
--- Scans the body of a slash-delimited regex.
-- Records the position of the closing slash in LastRegexEnd.
-- @param input  text being lexed
-- @param i      index of the first character after the opening '/'
-- @return index of the closing '/', or false when a line terminator is met,
--   a bracket expression is rejected by eatBrackets(), or the input ends
--   first
local function eatRegex(input, i)
  local e = #input
  while i <= e do
    local c = input:sub(i, i)
    if input:find('^[\r\n]', i) then
      return false
    elseif c == SLASH then
      LastRegexEnd = i
      return i
    elseif c == LEFTBRACKET then
      -- Slashes inside a bracket expression do not end the regex.
      i = eatBrackets(input, i + 1, e)
      if not i then return false end
    elseif c == BACKSLASH then
      -- Skip the escaped character; an escaped CRLF counts as one.
      i = i + 1
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end
-- Result handed from scanGawkRegex() to scanRegex(): the index just past a
-- plain (non-gawk) regex, or false.
local ScanRegexResult

--- Match-time function for the 'gawkRegex' token.
-- Succeeds only for a regex that contains one of the escapes \B \S \s \W \w
-- \y \< \> \` \' preceded by an odd number of backslashes. For any other
-- regex it fails, after storing the end position in ScanRegexResult so that
-- scanRegex() can accept it.
-- @param input  text being lexed
-- @param index  index of the first character after the opening '/'
-- @return index just past the closing '/' for a gawk-specific regex,
--   false otherwise
local function scanGawkRegex(input, index)
  -- Start from a clean slate so a stale value is never handed on.
  ScanRegexResult = false
  if not isRegex(input, index - 2) then return false end
  local i = eatRegex(input, index)
  if not i then return false end
  local rx = input:sub(index - 1, i)
  -- Each match starts at the first backslash of a run because the search
  -- begins at the opening '/' and resumes right after the previous match.
  -- No leading "non-backslash" character is consumed, so an escape that
  -- directly follows a previous match (as in /\\s\s/) is still seen.
  for bs in rx:gmatch("(\\+)[BSsWwy<>`']") do
    -- /\S/ is special, but /\\S/ is not.
    if #bs % 2 == 1 then return i + 1 end
  end
  ScanRegexResult = i + 1
  return false
end
--- Match-time function for the plain 'regex' token.
-- Is only called immediately after scanGawkRegex(), and simply reports what
-- that call stored in ScanRegexResult.
-- @return the stored index just past the regex, or false
local function scanRegex()
  return ScanRegexResult
end
--- Scans the body of a double-quoted string.
-- @param input  text being lexed
-- @param index  index of the first character after the opening quote
-- @return index just past the closing quote, or false when a line
--   terminator or the end of input is reached first
local function scanString(input, index)
  local i = index
  local e = #input
  while i <= e do
    local c = input:sub(i, i)
    if input:find('^[\r\n]', i) then
      return false
    elseif c == DQUOTE then
      return i + 1
    elseif c == BACKSLASH then
      -- Skip the escaped character.
      i = i + 1
      -- l.delimited_range() doesn't handle CRLF.
      if input:sub(i, i + 1) == CRLF then i = i + 1 end
    end
    i = i + 1
  end
  return false
end
--- Match-time function for comments: consumes the rest of the line.
-- purpose: prevent isRegex() from entering a comment line that ends with a
-- backslash. The position of such a backslash is remembered in
-- BackslashAtCommentEnd.
-- @param input  text being lexed
-- @param index  index of the first character after the '#'
-- @return index of the line terminator (or one past the end of input)
local function scanComment(input, index)
  -- The pattern may match the empty string, so `i` is always a number
  -- (index - 1 for an empty comment).
  local _, i = input:find('[^\r\n]*', index)
  if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
  return i + 1
end
--- Match-time function that scans a delimited field expression such as the
-- part after '(' in `$(x)` or after '[' in `$a[x]`.
-- Nesting of the opening delimiter's own kind and of the companion kind
-- (brackets vs. parentheses) is tracked; strings and regexes are skipped.
-- @param input  text being lexed
-- @param index  index of the first character after the opening delimiter
-- @return index just past the matching closing delimiter; false when a '#',
--   a line terminator or the end of input is reached first, or when the
--   companion delimiters are unbalanced
local function scanFieldDelimiters(input, index)
  local i = index
  local e = #input
  -- The opening delimiter was consumed just before `index`.
  local left = input:sub(i - 1, i - 1)
  local count = 1
  local right = DELIMITER_MATCHES[left]
  local left2 = COMPANION[left]
  local count2 = 0
  local right2 = DELIMITER_MATCHES[left2]
  while i <= e do
    local c = input:sub(i, i)
    if input:find('^[#\r\n]', i) then
      return false
    elseif c == right then
      count = count - 1
      -- Outermost delimiter closed: valid only if the companion kind is
      -- balanced as well.
      if count == 0 then return count2 == 0 and i + 1 end
    elseif c == left then
      count = count + 1
    elseif c == right2 then
      count2 = count2 - 1
      if count2 < 0 then return false end
    elseif c == left2 then
      count2 = count2 + 1
    elseif c == DQUOTE then
      i = scanString(input, i + 1)
      if not i then return false end
      -- scanString() returns the index past the quote; step back so the
      -- increment below lands there.
      i = i - 1
    elseif c == SLASH then
      if isRegex(input, i - 1) then
        -- eatRegex() returns the index of the closing slash.
        i = eatRegex(input, i + 1)
        if not i then return false end
      end
    elseif c == BACKSLASH then
      -- Backslash followed by a line terminator: skip over the terminator.
      if input:sub(i + 1, i + 2) == CRLF then
        i = i + 2
      elseif input:find('^[\r\n]', i + 1) then
        i = i + 1
      end
    end
    i = i + 1
  end
  return false
end
-- Whitespace.
local ws = token(l.WHITESPACE, l.space^1)

-- Comments: a '#' followed by whatever scanComment() consumes.
local comment = token(l.COMMENT, P('#') * P(scanComment))

-- Strings: a double quote followed by whatever scanString() consumes.
-- Note: this local shadows the standard `string` library from here on.
local string = token(l.STRING, P(DQUOTE) * P(scanString))

-- Regular expressions.
-- Slash delimited regular expressions are preceded by most operators or
-- the keywords 'print' and 'case', possibly on a preceding line. They
-- can contain unescaped slashes and brackets in brackets. Some escape
-- sequences like '\S', '\s' have special meanings with Gawk. Tokens that
-- contain them are displayed differently.
local regex = token(l.REGEX, P(SLASH) * P(scanRegex))
local gawkRegex = token('gawkRegex', P(SLASH) * P(scanGawkRegex))

-- Decimal floating point literal.
-- No leading sign because it might be binary.
local float
do
  local digits = l.digit^1
  local mantissa = (digits * ('.' * l.digit^0)^-1) + ('.' * digits)
  local exponent = S('eE') * S('+-')^-1 * digits
  float = mantissa * exponent^-1
end

-- Numbers.
local number = token(l.NUMBER, float)
local gawkNumber = token('gawkNumber', l.hex_num + l.oct_num)

-- Operators.
local operator = token(l.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~'))
local gawkOperator = token('gawkOperator', P('|&') + '@' + '**=' + '**')

-- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
local field
do
  local sigil = P('$') * S('$+-')^0
  local parenthesized = l.word^0 * '(' * P(scanFieldDelimiters)
  local subscripted = l.word^1 * ('[' * P(scanFieldDelimiters))^-1
  local quoted = '"' * P(scanString)
  local slashed = '/' * P(eatRegex) * '/'
  field = token('field',
                sigil * (float + parenthesized + subscripted + quoted + slashed))
end

-- Functions: a word immediately followed by '(' (the '(' is not consumed).
local func = token(l.FUNCTION, l.word * #P('('))

-- Identifiers.
local identifier = token(l.IDENTIFIER, l.word)
-- Keywords.
local keyword = token(l.KEYWORD, word_match{
  'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete', 'do',
  'else', 'exit', 'exp', 'fflush', 'for', 'function', 'getline', 'gsub', 'if',
  'in', 'index', 'int', 'length', 'log', 'match', 'next', 'nextfile', 'print',
  'printf', 'rand', 'return', 'sin', 'split', 'sprintf', 'sqrt', 'srand', 'sub',
  'substr', 'system', 'tolower', 'toupper', 'while'
})

-- Additional keywords, tagged 'gawkKeyword' so they can be styled apart.
local gawkKeyword = token('gawkKeyword', word_match{
  'BEGINFILE', 'ENDFILE', 'adump', 'and', 'asort', 'asorti', 'bindtextdomain',
  'case', 'compl', 'dcgettext', 'dcngettext', 'default', 'extension', 'func',
  'gensub', 'include', 'isarray', 'load', 'lshift', 'mktime', 'or', 'patsplit',
  'rshift', 'stopme', 'strftime', 'strtonum', 'switch', 'systime', 'xor'
})

-- Built-in variables.
local builtInVariable = token('builtInVariable', word_match{
  'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR',
  'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
})

-- Additional built-in variables, tagged 'gawkBuiltInVariable'.
local gawkBuiltInVariable = token('gawkBuiltInVariable', word_match{
  'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE',
  'LINT', 'PREC', 'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
})
-- Rule list, tried in the order given.
-- Within each group order matters, but the groups themselves (except the
-- last) can be in any order.
M._rules = {
  {'whitespace', ws},

  {'comment', comment},

  {'string', string},

  {'field', field},

  -- gawkRegex must come directly before regex: scanRegex() only reports
  -- what the preceding scanGawkRegex() call stored.
  {'gawkRegex', gawkRegex},
  {'regex', regex},
  {'gawkOperator', gawkOperator},
  {'operator', operator},

  {'gawkNumber', gawkNumber},
  {'number', number},

  {'keyword', keyword},
  {'builtInVariable', builtInVariable},
  {'gawkKeyword', gawkKeyword},
  {'gawkBuiltInVariable', gawkBuiltInVariable},
  {'function', func},
  {'identifier', identifier},
}
-- Styles for the token names this lexer introduces. The gawk-specific
-- tokens reuse the style of their plain counterpart plus 'underlined'.
M._tokenstyles = {
  builtInVariable = l.STYLE_CONSTANT,
  default = l.STYLE_ERROR,
  field = l.STYLE_LABEL,
  gawkBuiltInVariable = l.STYLE_CONSTANT..',underlined',
  gawkKeyword = l.STYLE_KEYWORD..',underlined',
  gawkNumber = l.STYLE_NUMBER..',underlined',
  gawkOperator = l.STYLE_OPERATOR..',underlined',
  gawkRegex = l.STYLE_PREPROCESSOR..',underlined',
  regex = l.STYLE_PREPROCESSOR
}
-- Folding: braces in operator tokens open and close a fold ('{' = 1,
-- '}' = -1); '#' in comment tokens is handled by l.fold_line_comments('#').
M._foldsymbols = {
  _patterns = {'[{}]', '#'},
  [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
  [l.COMMENT] = {['#'] = l.fold_line_comments('#')}
}

return M