1 -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
3 -- Modified by Wolfgang Seeberg 2012, 2013.
5 local l
= require('lexer')
6 local token
, word_match
= l
.token
, l
.word_match
7 local P
, R
, S
= lpeg
.P
, lpeg
.R
, lpeg
.S
-- Lexer module table; the only field set on these lines is _NAME.
9 local M
= {_NAME
= 'awk'}
-- One-character string constants that the scanners compare input against.
11 local LEFTBRACKET
= '['
12 local RIGHTBRACKET
= ']'
14 local BACKSLASH
= '\\'
-- Maps an opening delimiter to its closing delimiter.
20 local DELIMITER_MATCHES
= {['('] = ')', ['['] = ']'}
-- Maps each opening delimiter to the other kind of opening delimiter.
21 local COMPANION
= {['('] = '[', ['['] = '('}
-- Set of names, each mapped to 1.
-- NOTE(review): the line that opens this table is not visible in this view;
-- presumably it is the CC table read by eatCharacterClass() -- confirm.
23 alnum
= 1, alpha
= 1, blank
= 1, cntrl
= 1, digit
= 1, graph
= 1, lower
= 1,
24 print = 1, punct
= 1, space
= 1, upper
= 1, xdigit
= 1
-- Position that isRegex() compares a preceding slash against; starts at 0.
-- NOTE(review): no assignment to it is visible in this view.
26 local LastRegexEnd
= 0
-- Position of a backslash that ends a comment. Written by scanComment() and
-- read by isRegex(); starts at 0.
27 local BackslashAtCommentEnd
= 0
-- Set of words, each mapped to 1; findKeyword() looks candidate words up here.
-- NOTE(review): the end of this table is not visible in this view.
28 local KW_BEFORE_RX
= {
29 case
= 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf
= 1,
-- Tests whether the run of lowercase letters that ends at position e of input
-- is a key of KW_BEFORE_RX.
-- @param input The subject string.
-- @param e Position of the last character of the candidate word.
-- @return Result of KW_BEFORE_RX[w] == 1 on the visible return paths.
-- NOTE(review): the initialisation of i, the if/else framing, one branch body
-- and the closing end are not visible in this view.
33 local function findKeyword(input
, e
)
-- Step i to the left while the character at i is a lowercase letter.
35 while i
> 0 and input
:find("^[%l]", i
) do i
= i
- 1 end
-- w is the text from i + 1 through e, i.e. the lowercase run just skipped.
36 local w
= input
:sub(i
+ 1, e
)
38 return KW_BEFORE_RX
[w
] == 1
-- Branch taken when the character at i is an uppercase letter, a digit or an
-- underscore; its body is not visible in this view.
39 elseif input
:find("^[%u%d_]", i
) then
42 return KW_BEFORE_RX
[w
] == 1
-- Inspects input at and before position i and returns a boolean; judging by
-- the name and its callers, true means a slash after this position may start
-- a regular expression.
-- @param input The subject string.
-- @param i Position at which the backwards inspection starts.
-- NOTE(review): several branch bodies and the closing end are not visible in
-- this view; only the visible paths are described below.
46 local function isRegex(input
, i
)
-- Skip spaces and tabs to the left of i.
47 while i
>= 1 and input
:find('^[ \t]', i
) do i
= i
- 1 end
-- Nothing but blanks precedes: report true.
48 if i
< 1 then return true end
-- Case: the character at i is one of the listed punctuation characters or a
-- form feed, or findKeyword() accepts the word ending at i.
49 if input
:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i
) or findKeyword(input
, i
) then
-- Case: the character at i is a slash. The answer is true unless i equals
-- LastRegexEnd.
51 elseif input
:sub(i
, i
) == SLASH
then
52 return i
~= LastRegexEnd
-- deals with /xx/ / /yy/.
-- Case: the character at i is a right bracket, an alphanumeric, a right
-- parenthesis, a dot or a double quote.
53 elseif input
:find('^[]%w)."]', i
) then
-- Case: the character at i is LF. Reaching position 1 yields true.
55 elseif input
:sub(i
, i
) == LF
then
56 if i
== 1 then return true end
-- A CR found here is handled the same way (presumably the CR of a CRLF pair,
-- after a step to the left that is not visible in this view).
58 if input
:sub(i
, i
) == CR
then
59 if i
== 1 then return true end
-- Case: the character at i is CR.
62 elseif input
:sub(i
, i
) == CR
then
63 if i
== 1 then return true end
-- A backslash at i that is not the one recorded in BackslashAtCommentEnd
-- makes the function recurse on the position before it.
68 if input
:sub(i
, i
) == BACKSLASH
and i
~= BackslashAtCommentEnd
then
69 return isRegex(input
, i
- 1)
-- Looks for the two-character terminator :] and checks the name in front of
-- it against the CC table.
-- @param input The subject string.
-- @param s Position where the class name starts.
-- @param e Upper bound passed in by the caller.
-- @return On the visible return path: i + 1 when the name is in CC, else false.
-- NOTE(review): the initialisation of i, the loop around these tests, the
-- newline branch body and the closing end are not visible in this view.
75 local function eatCharacterClass(input
, s
, e
)
-- Case: the character at i is CR or LF.
78 if input
:find('^[\r\n]', i
) then
-- Case: the terminator :] starts at i.
80 elseif input
:sub(i
, i
+ 1) == ':]' then
-- str is the text from s up to, but not including, the terminator.
81 local str
= input
:sub(s
, i
- 1)
82 return CC
[str
] == 1 and i
+ 1
-- Scans the inside of a bracket expression starting at position i.
-- @param input The subject string.
-- @param i Position of the first character to examine.
-- @param e Upper bound, forwarded to eatCharacterClass().
-- @return false when eatCharacterClass() fails; other return paths are not
--   visible in this view.
-- NOTE(review): the loop around the tests, several branch bodies and the
-- closing end are not visible in this view.
89 local function eatBrackets(input
, i
, e
)
-- Step over a CARET at the start, if present.
90 if input
:sub(i
, i
) == CARET
then i
= i
+ 1 end
-- Step over a RIGHTBRACKET that comes first (after the optional CARET).
91 if input
:sub(i
, i
) == RIGHTBRACKET
then i
= i
+ 1 end
-- Case: the character at i is CR or LF.
93 if input
:find('^[\r\n]', i
) then
-- Case: the character at i is RIGHTBRACKET.
95 elseif input
:sub(i
, i
) == RIGHTBRACKET
then
-- Case: the two characters [: start at i. The text after them is handed to
-- eatCharacterClass(); a falsy result makes this function return false.
97 elseif input
:sub(i
, i
+ 1) == '[:' then
98 i
= eatCharacterClass(input
, i
+ 2, e
)
99 if not i
then return false end
-- Case: the character at i is BACKSLASH.
100 elseif input
:sub(i
, i
) == BACKSLASH
then
-- Advance one extra position when the two characters at i equal CRLF.
102 if input
:sub(i
, i
+ 1) == CRLF
then i
= i
+ 1 end
-- Scans a slash delimited regular expression starting at position i.
-- @param input The subject string.
-- @param i Position of the first character to examine.
-- @return false when eatBrackets() fails; other return paths are not visible
--   in this view.
-- NOTE(review): e is used below but is not a parameter; its definition, the
-- loop around the tests, several branch bodies and the closing end are not
-- visible in this view.
109 local function eatRegex(input
, i
)
-- Case: the character at i is CR or LF.
112 if input
:find('^[\r\n]', i
) then
-- Case: the character at i is SLASH.
114 elseif input
:sub(i
, i
) == SLASH
then
-- Case: the character at i is LEFTBRACKET. The bracket expression after it is
-- handed to eatBrackets(); a falsy result makes this function return false.
117 elseif input
:sub(i
, i
) == LEFTBRACKET
then
118 i
= eatBrackets(input
, i
+ 1, e
)
119 if not i
then return false end
-- Case: the character at i is BACKSLASH.
120 elseif input
:sub(i
, i
) == BACKSLASH
then
-- Advance one extra position when the two characters at i equal CRLF.
122 if input
:sub(i
, i
+ 1) == CRLF
then i
= i
+ 1 end
-- Result stored by scanGawkRegex() and returned by scanRegex().
129 local ScanRegexResult
-- Scanner used in the gawkRegex pattern (wrapped in P() after SLASH).
-- When isRegex() accepts the position index - 2, the text from index is run
-- through eatRegex(). If that text contains one of the listed escape letters
-- preceded by an odd number of backslashes, i + 1 is returned directly;
-- on the other visible paths the outcome (i + 1 or false) is stored in
-- ScanRegexResult.
-- @param input The subject string.
-- @param index Position supplied by the pattern matcher.
-- NOTE(review): parts of the if/else framing, the final return and the closing
-- end are not visible in this view.
130 local function scanGawkRegex(input
, index
)
131 if isRegex(input
, index
- 2) then
132 local i
= eatRegex(input
, index
)
134 ScanRegexResult
= false
-- rx is the text from index - 1 through i.
137 local rx
= input
:sub(index
- 1, i
)
-- bs captures each run of backslashes that follows a non-backslash character
-- and precedes one of the letters or symbols in the final set.
138 for bs
in rx
:gmatch("[^\\](\\+)[BSsWwy<>`']") do
139 -- /\S/ is special, but /\\S/ is not.
-- An odd run length means the last backslash escapes the following character.
140 if #bs
% 2 == 1 then return i
+ 1 end
142 ScanRegexResult
= i
+ 1
144 ScanRegexResult
= false
148 -- Is only called immediately after scanGawkRegex().
-- Returns whatever scanGawkRegex() last stored in ScanRegexResult, so the scan
-- is not repeated here.
-- NOTE(review): the closing end is not visible in this view.
149 local function scanRegex()
150 return ScanRegexResult
-- Scanner for double quoted strings; used in the string and field patterns and
-- called directly by scanFieldDelimiters().
-- @param input The subject string.
-- @param index Position of the first character after the opening quote.
-- NOTE(review): the initialisation of i, the loop around these tests, every
-- return statement and the closing end are not visible in this view.
153 local function scanString(input
, index
)
-- Case: the character at i is CR or LF.
157 if input
:find('^[\r\n]', i
) then
-- Case: the character at i is DQUOTE.
159 elseif input
:sub(i
, i
) == DQUOTE
then
-- Case: the character at i is BACKSLASH.
161 elseif input
:sub(i
, i
) == BACKSLASH
then
163 -- l.delimited_range() doesn't handle CRLF.
-- Advance one extra position when the two characters at i equal CRLF.
164 if input
:sub(i
, i
+ 1) == CRLF
then i
= i
+ 1 end
171 -- Purpose: prevent isRegex() from entering a comment line that ends with a backslash.
-- Scanner used in the comment pattern (wrapped in P() after the # character).
-- It finds the end of the run of non-newline characters that starts at index
-- and, when the last of them is a backslash, records that position in
-- BackslashAtCommentEnd.
-- @param input The subject string.
-- @param index Position supplied by the pattern matcher.
-- NOTE(review): the return statement and the closing end are not visible in
-- this view.
173 local function scanComment(input
, index
)
-- i is the end position of the match; the start position is discarded.
174 local _
, i
= input
:find('[^\r\n]*', index
)
175 if input
:sub(i
, i
) == BACKSLASH
then BackslashAtCommentEnd
= i
end
-- Scanner used in the field pattern after an opening parenthesis or bracket.
-- It tracks two kinds of delimiters: the one that opened the field (left and
-- right) and its companion kind (left2 and right2).
-- @param input The subject string.
-- @param index Position supplied by the pattern matcher.
-- @return On the visible paths: i + 1 when right is met with count == 0 and
--   count2 == 0, and false on the failure paths shown below.
-- NOTE(review): the initialisation of i, count and count2, the loop around the
-- tests, the counter updates and the closing end are not visible in this view.
179 local function scanFieldDelimiters(input
, index
)
-- left is the character just before i, i.e. the opening delimiter.
182 local left
= input
:sub(i
- 1, i
- 1)
-- right closes left; left2 is the other opening delimiter; right2 closes it.
184 local right
= DELIMITER_MATCHES
[left
]
185 local left2
= COMPANION
[left
]
187 local right2
= DELIMITER_MATCHES
[left2
]
-- Case: the character at i is #, CR or LF.
189 if input
:find('^[#\r\n]', i
) then
-- Case: the character at i is the closing delimiter.
191 elseif input
:sub(i
, i
) == right
then
-- With count at 0 the scan ends here; it succeeds only if count2 is 0 too.
193 if count
== 0 then return count2
== 0 and i
+ 1 end
-- Case: the character at i is another opening delimiter of the same kind.
194 elseif input
:sub(i
, i
) == left
then
-- Case: the character at i closes the companion kind of delimiter.
196 elseif input
:sub(i
, i
) == right2
then
-- A negative count2 makes the scan fail.
198 if count2
< 0 then return false end
-- Case: the character at i opens the companion kind of delimiter.
199 elseif input
:sub(i
, i
) == left2
then
-- Case: a double quote; the string after it is handed to scanString().
201 elseif input
:sub(i
, i
) == DQUOTE
then
202 i
= scanString(input
, i
+ 1)
203 if not i
then return false end
-- Case: a slash; when isRegex() accepts the position before it, the text after
-- it is handed to eatRegex().
205 elseif input
:sub(i
, i
) == SLASH
then
206 if isRegex(input
, i
- 1) then
207 i
= eatRegex(input
, i
+ 1)
208 if not i
then return false end
-- Case: a backslash; the visible tests check whether CRLF, or a single CR or
-- LF, follows it.
210 elseif input
:sub(i
, i
) == BACKSLASH
then
211 if input
:sub(i
+ 1, i
+ 2) == CRLF
then
213 elseif input
:find('^[\r\n]', i
+ 1) then
-- Whitespace token: one or more l.space characters.
223 local ws
= token(l
.WHITESPACE
, l
.space^
1)
-- Comment token: a # character followed by whatever scanComment() accepts.
226 local comment
= token(l
.COMMENT
, '#' * P(scanComment
))
-- String token: DQUOTE followed by whatever scanString() accepts.
-- NOTE(review): this local is named string, like the standard library table.
229 local string = token(l
.STRING
, DQUOTE
* P(scanString
))
231 -- Regular expressions.
232 -- Slash delimited regular expressions are preceded by most operators or
233 -- by keywords such as 'print' and 'case' (see KW_BEFORE_RX), possibly on a preceding line. They
234 -- can contain unescaped slashes and brackets in brackets. Some escape
235 -- sequences like '\S', '\s' have special meanings with Gawk. Tokens that
236 -- contain them are displayed differently.
-- regex relies on scanRegex(), which only returns the result that
-- scanGawkRegex() stored beforehand.
237 local regex
= token(l
.REGEX
, SLASH
* P(scanRegex
))
238 local gawkRegex
= token('gawkRegex', SLASH
* P(scanGawkRegex
))
240 -- no leading sign because it might be binary.
-- float: digits with an optional fractional part, or a dot followed by digits;
-- either form may be followed by an exponent with an optional sign.
241 local float
= ((l
.digit ^
1 * ('.' * l
.digit ^
0) ^
-1) +
242 ('.' * l
.digit ^
1)) * (S('eE') * S('+-') ^
-1 * l
.digit ^
1) ^
-1
244 local number = token(l
.NUMBER
, float
)
-- gawkNumber: l.hex_num or l.oct_num.
245 local gawkNumber
= token('gawkNumber', l
.hex_num
+ l
.oct_num
)
-- operator: any single character from the set below.
248 local operator
= token(l
.OPERATOR
, S('!%&()*+,-/:;<=>?[\\]^{|}~'))
-- gawkOperator: one of the literal operators listed below.
249 local gawkOperator
= token('gawkOperator', P("|&") + "@" + "**=" + "**")
251 -- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
-- A field is a dollar sign, any number of further $, + or - characters, then
-- one of: a float, an optional word with a parenthesised part, a word with an
-- optional bracketed part, a double quoted string, or a slash delimited regex.
252 local field
= token('field', P('$') * S('$+-') ^
0 *
253 (float
+ (l
.word ^
0 * '(' * P(scanFieldDelimiters
)) +
254 (l
.word ^
1 * ('[' * P(scanFieldDelimiters
)) ^
-1) +
255 ('"' * P(scanString
)) + ('/' * P(eatRegex
) * '/')))
-- func: a word that is directly followed by an opening parenthesis; the
-- parenthesis is only looked at, not consumed.
258 local func
= token(l
.FUNCTION
, l
.word
* #P('('))
-- identifier: any l.word.
261 local identifier
= token(l
.IDENTIFIER
, l
.word
)
-- Word lists for four token types: keyword and builtInVariable use the
-- standard token names or plain custom names, while the gawk variants use
-- custom names of their own.
-- NOTE(review): the closing lines of each word_match call are not visible in
-- this view.
264 local keyword
= token(l
.KEYWORD
, word_match
{
265 'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete', 'do',
266 'else', 'exit', 'exp', 'fflush', 'for', 'function', 'getline', 'gsub', 'if',
267 'in', 'index', 'int', 'length', 'log', 'match', 'next', 'nextfile', 'print',
268 'printf', 'rand', 'return', 'sin', 'split', 'sprintf', 'sqrt', 'srand', 'sub',
269 'substr', 'system', 'tolower', 'toupper', 'while'
-- Words tagged gawkKeyword.
272 local gawkKeyword
= token('gawkKeyword', word_match
{
273 'BEGINFILE', 'ENDFILE', 'adump', 'and', 'asort', 'asorti', 'bindtextdomain',
274 'case', 'compl', 'dcgettext', 'dcngettext', 'default', 'extension', 'func',
275 'gensub', 'include', 'isarray', 'load', 'lshift', 'mktime', 'or', 'patsplit',
276 'rshift', 'stopme', 'strftime', 'strtonum', 'switch', 'systime', 'xor'
-- Words tagged builtInVariable.
279 local builtInVariable
= token('builtInVariable', word_match
{
280 'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR',
281 'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
-- Words tagged gawkBuiltInVariable.
284 local gawkBuiltInVariable
= token('gawkBuiltInVariable', word_match
{
285 'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE',
286 'LINT', 'PREC', 'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
289 -- Within each group order matters, but the groups themselves (except the
290 -- last) can be in any order.
-- Each entry pairs a rule name with one of the token patterns defined above.
-- NOTE(review): the enclosing table constructor and some of its entries are
-- not visible in this view; presumably this is the rule list of the lexer.
294 {'comment', comment
},
300 {'gawkRegex', gawkRegex
},
302 {'gawkOperator', gawkOperator
},
303 {'operator', operator
},
305 {'gawkNumber', gawkNumber
},
308 {'keyword', keyword
},
309 {'builtInVariable', builtInVariable
},
310 {'gawkKeyword', gawkKeyword
},
311 {'gawkBuiltInVariable', gawkBuiltInVariable
},
313 {'identifier', identifier
},
-- Style assignments for the custom token names used above. The gawk variants
-- take a standard style with ',underlined' appended.
-- NOTE(review): the enclosing table constructor is not visible in this view;
-- presumably this is the token style table of the lexer.
317 builtInVariable
= l
.STYLE_CONSTANT
,
318 default
= l
.STYLE_ERROR
,
319 field
= l
.STYLE_LABEL
,
320 gawkBuiltInVariable
= l
.STYLE_CONSTANT
..',underlined',
321 gawkKeyword
= l
.STYLE_KEYWORD
..',underlined',
322 gawkNumber
= l
.STYLE_NUMBER
..',underlined',
323 gawkOperator
= l
.STYLE_OPERATOR
..',underlined',
324 gawkRegex
= l
.STYLE_PREPROCESSOR
..',underlined',
325 regex
= l
.STYLE_PREPROCESSOR
-- Fold settings: braces in operator tokens carry +1 and -1, and # in comment
-- tokens is handled by l.fold_line_comments('#').
-- NOTE(review): the enclosing table constructor is not visible in this view;
-- presumably this is the fold symbol table of the lexer.
329 _patterns
= {'[{}]', '#'},
330 [l
.OPERATOR
] = {['{'] = 1, ['}'] = -1},
331 [l
.COMMENT
] = {['#'] = l
.fold_line_comments('#')}