lexers/awk.lua

   1 -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
   2 -- AWK LPeg lexer.
   3 -- Modified by Wolfgang Seeberg 2012, 2013.
   4
   5 local l = require('lexer')
   6 local token, word_match = l.token, l.word_match
   7 local P, R, S = lpeg.P, lpeg.R, lpeg.S
   8
   9 local M = {_NAME = 'awk'}
  10
  11 local LEFTBRACKET = '['
  12 local RIGHTBRACKET = ']'
  13 local SLASH = '/'
  14 local BACKSLASH = '\\'
  15 local CARET = '^'
  16 local CR = '\r'
  17 local LF = '\n'
  18 local CRLF = CR .. LF
  19 local DQUOTE = '"'
  20 local DELIMITER_MATCHES = {['('] = ')', ['['] = ']'}
  21 local COMPANION = {['('] = '[', ['['] = '('}
  22 local CC = {
  23   alnum = 1, alpha = 1, blank = 1, cntrl = 1, digit = 1, graph = 1, lower = 1,
  24   print = 1, punct = 1, space = 1, upper = 1, xdigit = 1
  25 }
  26 local LastRegexEnd = 0
  27 local BackslashAtCommentEnd = 0
  28 local KW_BEFORE_RX = {
  29   case = 1, ['do'] = 1, ['else'] = 1, exit = 1, print = 1, printf = 1,
  30   ['return'] = 1
  31 }
  32
  33 local function findKeyword(input, e)
  34   local i = e
  35   while i > 0 and input:find("^[%l]", i) do i = i - 1 end
  36   local w = input:sub(i + 1, e)
  37   if i == 0 then
  38     return KW_BEFORE_RX[w] == 1
  39   elseif input:find("^[%u%d_]", i) then
  40     return false
  41   else
  42     return KW_BEFORE_RX[w] == 1
  43   end
  44 end
  45
  46 local function isRegex(input, i)
  47   while i >= 1 and input:find('^[ \t]', i) do i = i - 1 end
  48   if i < 1 then return true end
  49   if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) or findKeyword(input, i) then
  50     return true
  51   elseif input:sub(i, i) == SLASH then
  52     return i ~= LastRegexEnd -- deals with /xx/ / /yy/.
  53   elseif input:find('^[]%w)."]', i) then
  54     return false
  55   elseif input:sub(i, i) == LF then
  56     if i == 1 then return true end
  57     i = i - 1
  58     if input:sub(i, i) == CR then
  59       if i == 1 then return true end
  60       i = i - 1
  61     end
  62   elseif input:sub(i, i) == CR then
  63     if i == 1 then return true end
  64     i = i - 1
  65   else
  66     return false
  67   end
  68   if input:sub(i, i) == BACKSLASH and i ~= BackslashAtCommentEnd then
  69     return isRegex(input, i - 1)
  70   else
  71     return true
  72   end
  73 end
  74
  75 local function eatCharacterClass(input, s, e)
  76   local i = s
  77   while i <= e do
  78     if input:find('^[\r\n]', i) then
  79       return false
  80     elseif input:sub(i, i + 1) == ':]' then
  81       local str = input:sub(s, i - 1)
  82       return CC[str] == 1 and i + 1
  83     end
  84     i = i + 1
  85   end
  86   return false
  87 end
  88
  89 local function eatBrackets(input, i, e)
  90   if input:sub(i, i) == CARET then i = i + 1 end
  91   if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end
  92   while i <= e do
  93     if input:find('^[\r\n]', i) then
  94       return false
  95     elseif input:sub(i, i) == RIGHTBRACKET then
  96       return i
  97     elseif input:sub(i, i + 1) == '[:' then
  98       i = eatCharacterClass(input, i + 2, e)
  99       if not i then return false end
 100     elseif input:sub(i, i) == BACKSLASH then
 101       i = i + 1
 102       if input:sub(i, i + 1) == CRLF then i = i + 1 end
 103     end
 104     i = i + 1
 105   end
 106   return false
 107 end
 108
 109 local function eatRegex(input, i)
 110   local e = #input
 111   while i <= e do
 112     if input:find('^[\r\n]', i) then
 113       return false
 114     elseif input:sub(i, i) == SLASH then
 115       LastRegexEnd = i
 116       return i
 117     elseif input:sub(i, i) == LEFTBRACKET then
 118       i = eatBrackets(input, i + 1, e)
 119       if not i then return false end
 120     elseif input:sub(i, i) == BACKSLASH then
 121       i = i + 1
 122       if input:sub(i, i + 1) == CRLF then i = i + 1 end
 123     end
 124     i = i + 1
 125   end
 126   return false
 127 end
 128
 129 local ScanRegexResult
 130 local function scanGawkRegex(input, index)
 131   if isRegex(input, index - 2) then
 132     local i = eatRegex(input, index)
 133     if not i then
 134       ScanRegexResult = false
 135       return false
 136     end
 137     local rx = input:sub(index - 1, i)
 138     for bs in rx:gmatch("[^\\](\\+)[BSsWwy<>`']") do
 139       -- /\S/ is special, but /\\S/ is not.
 140       if #bs % 2 == 1 then return i + 1 end
 141     end
 142     ScanRegexResult = i + 1
 143   else
 144     ScanRegexResult = false
 145   end
 146   return false
 147 end
 148 -- Is only called immediately after scanGawkRegex().
 149 local function scanRegex()
 150   return ScanRegexResult
 151 end
 152
 153 local function scanString(input, index)
 154   local i = index
 155   local e = #input
 156   while i <= e do
 157     if input:find('^[\r\n]', i) then
 158       return false
 159     elseif input:sub(i, i) == DQUOTE then
 160       return i + 1
 161     elseif input:sub(i, i) == BACKSLASH then
 162       i = i + 1
 163       -- l.delimited_range() doesn't handle CRLF.
 164       if input:sub(i, i + 1) == CRLF then i = i + 1 end
 165     end
 166     i = i + 1
 167   end
 168   return false
 169 end
 170
 171 -- purpose: prevent isRegex() from entering a comment line that ends with a
 172 -- backslash.
 173 local function scanComment(input, index)
 174   local _, i = input:find('[^\r\n]*', index)
 175   if input:sub(i, i) == BACKSLASH then BackslashAtCommentEnd = i end
 176   return i + 1
 177 end
 178
 179 local function scanFieldDelimiters(input, index)
 180   local i = index
 181   local e = #input
 182   local left = input:sub(i - 1, i - 1)
 183   local count = 1
 184   local right = DELIMITER_MATCHES[left]
 185   local left2 = COMPANION[left]
 186   local count2 = 0
 187   local right2 = DELIMITER_MATCHES[left2]
 188   while i <= e do
 189     if input:find('^[#\r\n]', i) then
 190       return false
 191     elseif input:sub(i, i) == right then
 192       count = count - 1
 193       if count == 0 then return count2 == 0 and i + 1 end
 194     elseif input:sub(i, i) == left then
 195       count = count + 1
 196     elseif input:sub(i, i) == right2 then
 197       count2 = count2 - 1
 198       if count2 < 0 then return false end
 199     elseif input:sub(i, i) == left2 then
 200       count2 = count2 + 1
 201     elseif input:sub(i, i) == DQUOTE then
 202       i = scanString(input, i + 1)
 203       if not i then return false end
 204       i = i - 1
 205     elseif input:sub(i, i) == SLASH then
 206       if isRegex(input, i - 1) then
 207         i = eatRegex(input, i + 1)
 208         if not i then return false end
 209       end
 210     elseif input:sub(i, i) == BACKSLASH then
 211       if input:sub(i + 1, i + 2) == CRLF then
 212         i = i + 2
 213       elseif input:find('^[\r\n]', i + 1) then
 214         i = i + 1
 215       end
 216     end
 217     i = i + 1
 218   end
 219   return false
 220 end
 221
 222 -- Whitespace.
 223 local ws = token(l.WHITESPACE, l.space^1)
 224
 225 -- Comments.
 226 local comment = token(l.COMMENT, '#' * P(scanComment))
 227
 228 -- Strings.
 229 local string = token(l.STRING, DQUOTE * P(scanString))
 230
 231 -- Regular expressions.
 232 -- Slash delimited regular expressions are preceded by most operators or
 233 -- the keywords 'print' and 'case', possibly on a preceding line. They
 234 -- can contain unescaped slashes and brackets in brackets. Some escape
 235 -- sequences like '\S', '\s' have special meanings with Gawk. Tokens that
 236 -- contain them are displayed differently.
 237 local regex = token(l.REGEX, SLASH * P(scanRegex))
 238 local gawkRegex = token('gawkRegex', SLASH * P(scanGawkRegex))
 239
 240 -- no leading sign because it might be binary.
 241 local float = ((l.digit ^ 1 * ('.' * l.digit ^ 0) ^ -1) +
 242     ('.' * l.digit ^ 1)) * (S('eE') * S('+-') ^ -1 * l.digit ^ 1) ^ -1
 243 -- Numbers.
 244 local number = token(l.NUMBER, float)
 245 local gawkNumber = token('gawkNumber', l.hex_num + l.oct_num)
 246
 247 -- Operators.
 248 local operator = token(l.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~'))
 249 local gawkOperator = token('gawkOperator', P("|&") + "@" + "**=" + "**")
 250
 251 -- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
 252 local field = token('field', P('$') * S('$+-') ^ 0 *
 253                     (float + (l.word ^ 0 * '(' * P(scanFieldDelimiters)) +
 254                      (l.word ^ 1 * ('[' * P(scanFieldDelimiters)) ^ -1) +
 255                      ('"' * P(scanString)) + ('/' * P(eatRegex) * '/')))
 256
 257 -- Functions.
 258 local func = token(l.FUNCTION, l.word * #P('('))
 259
 260 -- Identifiers.
 261 local identifier = token(l.IDENTIFIER, l.word)
 262
 263 -- Keywords.
 264 local keyword = token(l.KEYWORD, word_match{
 265   'BEGIN', 'END', 'atan2', 'break', 'close', 'continue', 'cos', 'delete', 'do',
 266   'else', 'exit', 'exp', 'fflush', 'for', 'function', 'getline', 'gsub', 'if',
 267   'in', 'index', 'int', 'length', 'log', 'match', 'next', 'nextfile', 'print',
 268   'printf', 'rand', 'return', 'sin', 'split', 'sprintf', 'sqrt', 'srand', 'sub',
 269   'substr', 'system', 'tolower', 'toupper', 'while'
 270 })
 271
 272 local gawkKeyword = token('gawkKeyword', word_match{
 273   'BEGINFILE', 'ENDFILE', 'adump', 'and', 'asort', 'asorti', 'bindtextdomain',
 274   'case', 'compl', 'dcgettext', 'dcngettext', 'default', 'extension', 'func',
 275   'gensub', 'include', 'isarray', 'load', 'lshift', 'mktime', 'or', 'patsplit',
 276   'rshift', 'stopme', 'strftime', 'strtonum', 'switch', 'systime', 'xor'
 277 })
 278
 279 local builtInVariable = token('builtInVariable', word_match{
 280   'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR',
 281   'OFMT', 'OFS', 'ORS', 'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
 282 })
 283
 284 local gawkBuiltInVariable = token('gawkBuiltInVariable', word_match {
 285   'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE',
 286   'LINT', 'PREC', 'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
 287 })
 288
 289 -- Within each group order matters, but the groups themselves (except the
 290 -- last) can be in any order.
 291 M._rules = {
 292   {'whitespace', ws},
 293
 294   {'comment', comment},
 295
 296   {'string', string},
 297
 298   {'field', field},
 299
 300   {'gawkRegex', gawkRegex},
 301   {'regex', regex},
 302   {'gawkOperator', gawkOperator},
 303   {'operator', operator},
 304
 305   {'gawkNumber', gawkNumber},
 306   {'number', number},
 307
 308   {'keyword', keyword},
 309   {'builtInVariable', builtInVariable},
 310   {'gawkKeyword', gawkKeyword},
 311   {'gawkBuiltInVariable', gawkBuiltInVariable},
 312   {'function', func},
 313   {'identifier', identifier},
 314 }
 315
 316 M._tokenstyles = {
 317   builtInVariable = l.STYLE_CONSTANT,
 318   default = l.STYLE_ERROR,
 319   field = l.STYLE_LABEL,
 320   gawkBuiltInVariable = l.STYLE_CONSTANT..',underlined',
 321   gawkKeyword = l.STYLE_KEYWORD..',underlined',
 322   gawkNumber = l.STYLE_NUMBER..',underlined',
 323   gawkOperator = l.STYLE_OPERATOR..',underlined',
 324   gawkRegex = l.STYLE_PREPROCESSOR..',underlined',
 325   regex = l.STYLE_PREPROCESSOR
 326 }
 327
 328 M._foldsymbols = {
 329   _patterns = {'[{}]', '#'},
 330   [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
 331   [l.COMMENT] = {['#'] = l.fold_line_comments('#')}
 332 }
 333
 334 return M