1 -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
3 local M = {}
5 --[=[ This comment is for LuaDoc.
6 ---
7 -- Lexes Scintilla documents with Lua and LPeg.
8 --
9 -- ## Overview
11 -- Lexers highlight the syntax of source code. Scintilla (the editing component
12 -- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
13 -- lexers which are notoriously difficult to create and/or extend. On the other
14 -- hand, Lua makes it easy to rapidly create new lexers, extend existing
15 -- ones, and embed lexers within one another. Lua lexers tend to be more
16 -- readable than C++ lexers too.
18 -- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
19 -- [LPeg library][]. The following table comes from the LPeg documentation and
20 -- summarizes all you need to know about constructing basic LPeg patterns. This
21 -- module provides convenience functions for creating and working with other
22 -- more advanced patterns and concepts.
24 -- Operator | Description
25 -- ---------------------|------------
26 -- `lpeg.P(string)` | Matches `string` literally.
27 -- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters.
28 -- `lpeg.S(string)` | Matches any character in set `string`.
29 -- `lpeg.R("`_`xy`_`")` | Matches any character between `x` and `y`.
30 -- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`.
31 -- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`.
32 -- `patt1 * patt2` | Matches `patt1` followed by `patt2`.
33 -- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice).
34 -- `patt1 - patt2` | Matches `patt1` if `patt2` does not match.
35 -- `-patt` | Equivalent to `("" - patt)`.
36 -- `#patt` | Matches `patt` but consumes no input.
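--
-- As a brief, illustrative sketch (not taken from any particular lexer), these
-- operators compose into larger patterns. A pattern for a hexadecimal literal
-- could be written directly with the operators above:
--
--     local hex_digit = lpeg.R('09', 'af', 'AF')
--     local hex_literal = lpeg.P('0') * lpeg.S('xX') * hex_digit^1
--
-- This module already provides an equivalent ready-made pattern as
-- [`lexer.hex_num`](), so in practice you would simply use that.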
38 -- The first part of this document deals with rapidly constructing a simple
39 -- lexer. The next part deals with more advanced techniques, such as custom
40 -- coloring and embedding lexers within one another. Following that is a
41 -- discussion about code folding, or being able to tell Scintilla which code
42 -- blocks are "foldable" (temporarily hideable from view). After that are
43 -- instructions on how to use LPeg lexers with the aforementioned Textadept and
44 -- SciTE editors. Finally there are comments on lexer performance and
45 -- limitations.
47 -- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
48 -- [Textadept]: http://foicica.com/textadept
49 -- [SciTE]: http://scintilla.org/SciTE.html
51 -- ## Lexer Basics
53 -- The *lexers/* directory contains all lexers, including your new one. Before
54 -- attempting to write one from scratch though, first determine if your
55 -- programming language is similar to any of the 80+ languages supported. If so,
56 -- you may be able to copy and modify that lexer, saving some time and effort.
57 -- The filename of your lexer should be the name of your programming language in
58 -- lower case followed by a *.lua* extension. For example, a new Lua lexer has
59 -- the name *lua.lua*.
61 -- Note: Try to refrain from using one-character language names like "b", "c",
62 -- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
63 -- respectively.
65 -- ### New Lexer Template
67 -- There is a *lexers/template.txt* file that contains a simple template for a
68 -- new lexer. Feel free to use it, replacing the '?'s with the name of your
69 -- lexer:
71 -- -- ? LPeg lexer.
73 -- local l = require('lexer')
74 -- local token, word_match = l.token, l.word_match
75 -- local P, R, S = lpeg.P, lpeg.R, lpeg.S
77 -- local M = {_NAME = '?'}
79 -- -- Whitespace.
80 -- local ws = token(l.WHITESPACE, l.space^1)
82 -- M._rules = {
83 -- {'whitespace', ws},
84 -- }
86 -- M._tokenstyles = {
88 -- }
90 -- return M
92 -- The first few lines of code simply define often used convenience variables. The
93 -- `local M = {_NAME = '?'}` line and the final `return M` define and return the
94 -- lexer object Scintilla uses; both are required in every lexer. The `ws` line
95 -- defines something called a "token", an essential building block of lexers. You
96 -- will learn about tokens shortly. The rest of the code defines a set of grammar
97 -- rules and token styles. You will learn about those later. Note, however, the
98 -- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
99 -- belong to their respective lexers, but any non-local variables need the `M.`
100 -- prefix too so as not to affect Lua's global environment. All in all, this is
101 -- a minimal, working lexer that you can build on.
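--
-- To experiment with such a lexer outside of an editor, Scintillua can also be
-- used as a stand-alone Lua library. A minimal sketch, assuming the file is
-- saved as *mylang.lua* (a hypothetical name) somewhere on [`lexer.LEXERPATH`]():
--
--     local l = require('lexer')
--     local mylang = l.load('mylang')
--     local tokens = mylang:lex('some source text')
--
-- The result is a flat list of token names and positions, as described for
-- [`lexer.lex()`]() below.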
103 -- ### Tokens
105 -- Take a moment to think about your programming language's structure. What kind
106 -- of key elements does it have? In the template shown earlier, one predefined
107 -- element all languages have is whitespace. Your language probably also has
108 -- elements like comments, strings, and keywords. Lexers refer to these elements
109 -- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
110 -- break down source code into tokens for coloring, which results in the syntax
111 -- highlighting familiar to you. It is up to you how specific your lexer is when
112 -- it comes to tokens. Perhaps only distinguishing between keywords and
113 -- identifiers is necessary, or maybe recognizing constants and built-in
114 -- functions, methods, or libraries is desirable. The Lua lexer, for example,
115 -- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
116 -- functions, constants, built-in libraries, identifiers, labels, and operators.
117 -- Even though constants, built-in functions, and built-in libraries are subsets
118 -- of identifiers, Lua programmers find it helpful for the lexer to distinguish
119 -- between them all. It is perfectly acceptable to just recognize keywords and
120 -- identifiers.
122 -- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
123 -- sequence of characters recognized as an instance of that token. Create tokens
124 -- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
125 -- defined in the template shown earlier:
127 -- local ws = token(l.WHITESPACE, l.space^1)
129 -- At first glance, the first argument does not appear to be a string name and
130 -- the second argument does not appear to be an LPeg pattern. Perhaps you
131 -- expected something like:
133 -- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
135 -- The `lexer` (`l`) module actually provides a convenient list of common token
136 -- names and common LPeg patterns for you to use. Token names include
137 -- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
138 -- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
139 -- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
140 -- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
141 -- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
142 -- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
143 -- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
144 -- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
145 -- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
146 -- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
147 -- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
148 -- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
149 -- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
150 -- none of the above fit your language, but an advantage to using predefined
151 -- token names is that your lexer's tokens will inherit the universal syntax
152 -- highlighting color theme used by your text editor.
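--
-- For example, an identifier token can combine a predefined token name with a
-- predefined pattern directly (a small illustrative sketch, not part of the
-- template):
--
--     local identifier = token(l.IDENTIFIER, l.word)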
154 -- #### Example Tokens
156 -- So, how might you define other tokens like comments, strings, and keywords?
157 -- Here are some examples.
159 -- **Comments**
161 -- Line-style comments with a prefix character(s) are easy to express with LPeg:
163 -- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
164 -- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
166 -- The comments above start with a '#' or "//" and go to the end of the line.
167 -- The second comment recognizes the next line also as a comment if the current
168 -- line ends with a '\' escape character.
170 -- C-style "block" comments with a start and end delimiter are also easy to
171 -- express:
173 -- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
175 -- This comment starts with a "/\*" sequence and contains anything up to and
176 -- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
177 -- can recognize unfinished comments as comments and highlight them properly.
179 -- **Strings**
181 -- It is tempting to think that a string is not much different from the block
182 -- comment shown above in that both have start and end delimiters:
184 -- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
185 -- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
186 -- local simple_string = token(l.STRING, dq_str + sq_str)
188 -- However, most programming languages allow escape sequences in strings such
189 -- that a sequence like "\\"" in a double-quoted string indicates that the
190 -- '"' is not the end of the string. The above token incorrectly matches
191 -- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
192 -- function.
194 -- local dq_str = l.delimited_range('"')
195 -- local sq_str = l.delimited_range("'")
196 -- local string = token(l.STRING, dq_str + sq_str)
198 -- In this case, the lexer treats '\' as an escape character in a string
199 -- sequence.
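--
-- [`lexer.delimited_range()`]() also accepts optional flags, described later in
-- this document. For instance, if your language's strings may not span lines, a
-- sketch of a single-line string token might be:
--
--     local line_str = token(l.STRING, l.delimited_range('"', true))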
201 -- **Keywords**
203 -- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
204 -- choices, use another convenience function: [`lexer.word_match()`](). It is
205 -- much easier and more efficient to write word matches like:
207 -- local keyword = token(l.KEYWORD, l.word_match{
208 -- 'keyword_1', 'keyword_2', ..., 'keyword_n'
209 -- })
211 -- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
212 -- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
213 -- }, nil, true))
215 -- local hyphened_keyword = token(l.KEYWORD, l.word_match({
216 -- 'keyword-1', 'keyword-2', ..., 'keyword-n'
217 -- }, '-'))
219 -- By default, characters considered to be in keywords are in the set of
220 -- alphanumeric characters and underscores. The last token demonstrates how to
221 -- allow '-' (hyphen) characters to be in keywords as well.
223 -- **Numbers**
225 -- Most programming languages have the same format for integer and float tokens,
226 -- so it might be as simple as using a couple of predefined LPeg patterns:
228 -- local number = token(l.NUMBER, l.float + l.integer)
230 -- However, some languages allow postfix characters on integers.
232 -- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
233 -- local number = token(l.NUMBER, l.float + l.hex_num + integer)
235 -- Your language may need other tweaks, but it is up to you how fine-grained you
236 -- want your highlighting to be. After all, you are not writing a compiler or
237 -- interpreter!
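--
-- As another sketch of such a tweak, a hypothetical language with binary
-- literals (using the `S` local from the template) might extend the number
-- token like this:
--
--     local bin_num = '0' * S('bB') * S('01')^1
--     local number = token(l.NUMBER, l.float + l.hex_num + bin_num + l.integer)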
239 -- ### Rules
241 -- Programming languages have grammars, which specify valid token structure. For
242 -- example, comments usually cannot appear within a string. Grammars consist of
243 -- rules, which are simply combinations of tokens. Recall from the lexer
244 -- template the `_rules` table, which defines all the rules used by the lexer
245 -- grammar:
247 -- M._rules = {
248 -- {'whitespace', ws},
249 -- }
251 -- Each entry in a lexer's `_rules` table consists of a rule name and its
252 -- associated pattern. Rule names are completely arbitrary and serve only to
253 -- identify and distinguish between different rules. Rule order is important: if
254 -- text does not match the first rule, the lexer tries the second rule, and so
255 -- on. This simple grammar says to match whitespace tokens under a rule named
256 -- "whitespace".
258 -- To illustrate the importance of rule order, here is an example of a
259 -- simplified Lua grammar:
261 -- M._rules = {
262 -- {'whitespace', ws},
263 -- {'keyword', keyword},
264 -- {'identifier', identifier},
265 -- {'string', string},
266 -- {'comment', comment},
267 -- {'number', number},
268 -- {'label', label},
269 -- {'operator', operator},
270 -- }
272 -- Note how identifiers come after keywords. In Lua, as with most programming
273 -- languages, the characters allowed in keywords and identifiers are in the same
274 -- set (alphanumerics plus underscores). If the lexer specified the "identifier"
275 -- rule before the "keyword" rule, all keywords would match identifiers and thus
276 -- incorrectly highlight as identifiers instead of keywords. The same idea
277 -- applies to function, constant, etc. tokens that you may want to distinguish
278 -- between: their rules should come before identifiers.
280 -- So what about text that does not match any rules? For example in Lua, the '!'
281 -- character is meaningless outside a string or comment. Normally the lexer
282 -- skips over such text. If instead you want to highlight these "syntax errors",
283 -- add an additional end rule:
285 -- M._rules = {
286 -- {'whitespace', ws},
287 -- {'error', token(l.ERROR, l.any)},
288 -- }
290 -- This identifies and highlights any character not matched by an existing
291 -- rule as a `lexer.ERROR` token.
293 -- Even though the rules defined in the examples above contain a single token,
294 -- rules may consist of multiple tokens. For example, a rule for an HTML tag
295 -- could consist of a tag token followed by an arbitrary number of attribute
296 -- tokens, allowing the lexer to highlight all tokens separately. The rule might
297 -- look something like this:
299 -- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
301 -- Note however that lexers with complex rules like these are more prone to lose
302 -- track of their state.
304 -- ### Summary
306 -- Lexers primarily consist of tokens and grammar rules. At your disposal are a
307 -- number of convenience patterns and functions for rapidly creating a lexer. If
308 -- you choose to use predefined token names for your tokens, you do not have to
309 -- define how the lexer highlights them. The tokens will inherit the default
310 -- syntax highlighting color theme your editor uses.
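--
-- As a rough "putting it together" sketch for a hypothetical little language
-- with only keywords, strings, comments, and identifiers, the pieces above
-- combine like this:
--
--     local ws = token(l.WHITESPACE, l.space^1)
--     local keyword = token(l.KEYWORD, l.word_match{'if', 'else', 'end'})
--     local str = token(l.STRING, l.delimited_range('"'))
--     local comment = token(l.COMMENT, '#' * l.nonnewline^0)
--     local identifier = token(l.IDENTIFIER, l.word)
--
--     M._rules = {
--       {'whitespace', ws},
--       {'keyword', keyword},
--       {'identifier', identifier},
--       {'string', str},
--       {'comment', comment},
--     }
--
-- All of these tokens use predefined token names, so no `_tokenstyles` entries
-- are needed.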
312 -- ## Advanced Techniques
314 -- ### Styles and Styling
316 -- The most basic form of syntax highlighting is assigning different colors to
317 -- different tokens. Instead of highlighting with just colors, Scintilla allows
318 -- for more rich highlighting, or "styling", with different fonts, font sizes,
319 -- font attributes, and foreground and background colors, just to name a few.
320 -- The unit of this rich highlighting is called a "style". Styles are simply
321 -- strings of comma-separated property settings. By default, lexers associate
322 -- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
323 -- `lexer.STRING`, etc. with particular styles as part of a universal color
324 -- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
325 -- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
326 -- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
327 -- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
328 -- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
329 -- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
330 -- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
331 -- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
332 -- predefined token names and LPeg patterns, you may define your own styles. At
333 -- their core, styles are just strings, so you may create new ones and/or modify
334 -- existing ones. Each style consists of the following comma-separated settings:
336 -- Setting | Description
337 -- ---------------|------------
338 -- font:_name_ | The name of the font the style uses.
339 -- size:_int_ | The size of the font the style uses.
340 -- [not]bold | Whether or not the font face is bold.
341 -- [not]italics | Whether or not the font face is italic.
342 -- [not]underlined| Whether or not the font face is underlined.
343 -- fore:_color_ | The foreground color of the font face.
344 -- back:_color_ | The background color of the font face.
345 -- [not]eolfilled | Does the background color extend to the end of the line?
346 -- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
347 -- [not]visible | Whether or not the text is visible.
348 -- [not]changeable| Whether the text is changeable or read-only.
350 -- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
351 -- decimal equivalent of the latter. As with token names, LPeg patterns, and
352 -- styles, there is a set of predefined color names, but they vary depending on
353 -- the current color theme in use. Therefore, it is generally not a good idea to
354 -- manually define colors within styles in your lexer since they might not fit
355 -- into a user's chosen color theme. Try to refrain from even using predefined
356 -- colors in a style because that color may be theme-specific. Instead, the best
357 -- practice is to either use predefined styles or derive new color-agnostic
358 -- styles from predefined ones. For example, Lua "longstring" tokens use the
359 -- existing `lexer.STYLE_STRING` style instead of defining a new one.
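--
-- In a lexer's `_tokenstyles` table, that kind of reuse might look something
-- like this (a sketch; the actual Lua lexer's table may differ):
--
--     M._tokenstyles = {
--       longstring = l.STYLE_STRING
--     }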
361 -- #### Example Styles
363 -- Defining styles is pretty straightforward. An empty style that inherits the
364 -- default theme settings is simply an empty string:
366 -- local style_nothing = ''
368 -- A similar style but with a bold font face looks like this:
370 -- local style_bold = 'bold'
372 -- If you want the same style, but also with an italic font face, define the new
373 -- style in terms of the old one:
375 -- local style_bold_italic = style_bold..',italics'
377 -- This allows you to derive new styles from predefined ones without having to
378 -- rewrite them. This operation leaves the old style unchanged. Thus if you
379 -- had a "static variable" token whose style you wanted to base off of
380 -- `lexer.STYLE_VARIABLE`, it would probably look like:
382 -- local style_static_var = l.STYLE_VARIABLE..',italics'
384 -- The color theme files in the *lexers/themes/* folder give more examples of
385 -- style definitions.
387 -- ### Token Styles
389 -- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
390 -- Recall the token definition and `_tokenstyles` table from the lexer template:
392 -- local ws = token(l.WHITESPACE, l.space^1)
394 -- ...
396 -- M._tokenstyles = {
398 -- }
400 -- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
401 -- earlier, lexers automatically associate tokens that use predefined token
402 -- names with a particular style. Only tokens with custom token names need
403 -- manual style associations. As an example, consider a custom whitespace token:
405 -- local ws = token('custom_whitespace', l.space^1)
407 -- Assigning a style to this token looks like:
409 -- M._tokenstyles = {
410 -- custom_whitespace = l.STYLE_WHITESPACE
411 -- }
413 -- Do not confuse token names with rule names. They are completely different
414 -- entities. In the example above, the lexer assigns the "custom_whitespace"
415 -- token the existing style for `WHITESPACE` tokens. If instead you want to
416 -- color the background of whitespace a shade of grey, it might look like:
418 -- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
419 -- M._tokenstyles = {
420 -- custom_whitespace = custom_style
421 -- }
423 -- Notice that the lexer performs Scintilla/SciTE-style "$()" property expansion.
424 -- You may also use "%()". Remember to refrain from assigning specific colors in
425 -- styles, but in this case, all user color themes probably define the
426 -- "color.grey" property.
428 -- ### Line Lexers
430 -- By default, lexers match the arbitrary chunks of text passed to them by
431 -- Scintilla. These chunks may be a full document, only the visible part of a
432 -- document, or even just portions of lines. Some lexers need to match whole
433 -- lines. For example, a lexer for the output of a file "diff" needs to know if
434 -- the line started with a '+' or '-' and then style the entire line
435 -- accordingly. To indicate that your lexer matches by line, use the
436 -- `_LEXBYLINE` field:
438 -- M._LEXBYLINE = true
440 -- Now the input text for the lexer is a single line at a time. Keep in mind
441 -- that line lexers do not have the ability to look ahead at subsequent lines.
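--
-- For example, a line lexer for diff-like output might define whole-line tokens
-- along these lines (a sketch with custom token names, which would each need an
-- entry in `_tokenstyles`):
--
--     local added = token('addition', '+' * l.nonnewline^0)
--     local removed = token('deletion', '-' * l.nonnewline^0)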
443 -- ### Embedded Lexers
445 -- Lexers embed within one another very easily, requiring minimal effort. In the
446 -- following sections, the lexer being embedded is called the "child" lexer and
447 -- the lexer a child is being embedded in is called the "parent". For example,
448 -- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
449 -- its respective HTML or CSS file. However, CSS can be embedded inside
450 -- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
451 -- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
452 -- sounds a lot like the case with CSS, but there is a subtle difference: PHP
453 -- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
454 -- difference results in two types of embedded lexers: a parent lexer that
455 -- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
456 -- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
458 -- #### Parent Lexer
460 -- Before embedding a child lexer into a parent lexer, the parent lexer needs to
461 -- load the child lexer. This is done with the [`lexer.load()`]() function. For
462 -- example, loading the CSS lexer within the HTML lexer looks like:
464 -- local css = l.load('css')
466 -- The next part of the embedding process is telling the parent lexer when to
467 -- switch over to the child lexer and when to switch back. The lexer refers to
468 -- these indications as the "start rule" and "end rule", respectively, and are
469 -- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
470 -- HTML to CSS is when the lexer encounters a "style" tag with a "type"
471 -- attribute whose value is "text/css":
473 -- local css_tag = P('<style') * P(function(input, index)
474 -- if input:find('^[^>]+type="text/css"', index) then
475 -- return index
476 -- end
477 -- end)
479 -- This pattern looks for the beginning of a "style" tag and searches its
480 -- attribute list for the text "`type="text/css"`". (In this simplified example,
481 -- the Lua pattern does not consider whitespace around the '=' nor does it
482 -- consider that using single quotes is valid.) If there is a match, the
483 -- functional pattern returns a value instead of `nil`. In this case, the value
484 -- returned does not matter because we ultimately want to style the "style" tag
485 -- as an HTML tag, so the actual start rule looks like this:
487 -- local css_start_rule = #css_tag * tag
489 -- Now that the parent knows when to switch to the child, it needs to know when
490 -- to switch back. In the case of HTML/CSS, the switch back occurs when the
491 -- lexer encounters an ending "style" tag, though the lexer should still style
492 -- the tag as an HTML tag:
494 -- local css_end_rule = #P('</style>') * tag
496 -- Once the parent loads the child lexer and defines the child's start and end
497 -- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
499 -- l.embed_lexer(M, css, css_start_rule, css_end_rule)
501 -- The first parameter is the parent lexer object to embed the child in, which
502 -- in this case is `M`. The other three parameters are the child lexer object
503 -- loaded earlier followed by its start and end rules.
505 -- #### Child Lexer
507 -- The process for instructing a child lexer to embed itself into a parent is
508 -- very similar to embedding a child into a parent: first, load the parent lexer
509 -- into the child lexer with the [`lexer.load()`]() function and then create
510 -- start and end rules for the child lexer. However, in this case, swap the
511 -- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
512 -- lexer:
514 -- local html = l.load('html')
515 -- local php_start_rule = token('php_tag', '<?php ')
516 -- local php_end_rule = token('php_tag', '?>')
517 -- l.embed_lexer(html, M, php_start_rule, php_end_rule)
519 -- ## Code Folding
521 -- When reading source code, it is occasionally helpful to temporarily hide
522 -- blocks of code like functions, classes, comments, etc. This is the concept of
523 -- "folding". In the Textadept and SciTE editors for example, little indicators
524 -- in the editor margins appear next to code that can be folded at places called
525 -- "fold points". When the user clicks an indicator, the editor hides the code
526 -- associated with the indicator until the user clicks the indicator again. The
527 -- lexer specifies these fold points and what code exactly to fold.
529 -- The fold points for most languages occur on keywords or character sequences.
530 -- Examples of fold keywords are "if" and "end" in Lua and examples of fold
531 -- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
532 -- comment delimiters, respectively. However, these fold points cannot occur
533 -- just anywhere. For example, lexers should not recognize fold keywords that
534 -- appear within strings or comments. The lexer's `_foldsymbols` table allows
535 -- you to conveniently define fold points with such granularity. For example,
536 -- consider C:
538 -- M._foldsymbols = {
539 -- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
540 -- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
541 -- _patterns = {'[{}]', '/%*', '%*/'}
542 -- }
544 -- The first assignment states that any '{' or '}' that the lexer recognizes as
545 -- a `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
546 -- match is a beginning fold point and `-1` indicates the match is an ending
547 -- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
548 -- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
549 -- The lexer does not consider any occurrences of these characters outside their
550 -- defined tokens (such as in a string) as fold points. Finally, every
551 -- `_foldsymbols` table must have a `_patterns` field that contains a list of
552 -- [Lua patterns][] that match fold points. If the lexer encounters text that
553 -- matches one of those patterns, the lexer looks up the matched text in its
554 -- token's table to determine whether or not the text is a fold point. In the
555 -- example above, the first Lua pattern matches any '{' or '}' characters. When
556 -- the lexer comes across one of those characters, it checks if the match is a
557 -- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
558 -- point. The same idea applies for the other patterns. (The '%' is in the other
559 -- patterns because '\*' is a special character in Lua patterns that needs
560 -- escaping.) How do you specify fold keywords? Here is an example for Lua:
562 -- M._foldsymbols = {
563 -- [l.KEYWORD] = {
564 -- ['if'] = 1, ['do'] = 1, ['function'] = 1,
565 -- ['end'] = -1, ['repeat'] = 1, ['until'] = -1
566 -- },
567 -- _patterns = {'%l+'}
568 -- }
570 -- Any time the lexer encounters a lower case word, if that word is a
571 -- `lexer.KEYWORD` token and in the associated list of fold points, the lexer
572 -- identifies the word as a fold point.
574 -- If your lexer needs to do some additional processing to determine if a match
575 -- is a fold point, assign a function that returns an integer. Returning `1` or
576 -- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
577 -- For example:
579 -- local function fold_strange_token(text, pos, line, s, match)
580 -- if ... then
581 -- return 1 -- beginning fold point
582 -- elseif ... then
583 -- return -1 -- ending fold point
584 -- end
585 -- return 0
586 -- end
588 -- M._foldsymbols = {
589 -- ['strange_token'] = {['|'] = fold_strange_token},
590 -- _patterns = {'|'}
591 -- }
593 -- Any time the lexer encounters a '|' that is a "strange_token", it calls the
594 -- `fold_strange_token` function to determine if '|' is a fold point. The lexer
595 -- calls these functions with the following arguments: the text to identify fold
596 -- points in, the beginning position of the current line in the text to fold,
597 -- the current line's text, the position in the current line the matched text
598 -- starts at, and the matched text itself.
600 -- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
602 -- ### Fold by Indentation
604 -- Some languages have significant whitespace and/or no delimiters that indicate
605 -- fold points. If your lexer falls into this category and you would like to
606 -- mark fold points based on changes in indentation, use the
607 -- `_FOLDBYINDENTATION` field:
609 -- M._FOLDBYINDENTATION = true
611 -- ## Using Lexers
613 -- ### Textadept
615 -- Put your lexer in your *~/.textadept/lexers/* directory so you do not
616 -- overwrite it when upgrading Textadept. Also, lexers in this directory
617 -- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
618 -- the default *lua* lexer. This is convenient for tweaking a default lexer to
619 -- your liking. Then add a [file type][] for your lexer if necessary.
621 -- [file type]: _M.textadept.file_types.html
623 -- ### SciTE
625 -- Create a *.properties* file for your lexer and `import` it in either your
626 -- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the
627 -- *.properties* file should contain:
629 -- file.patterns.[lexer_name]=[file_patterns]
630 -- lexer.$(file.patterns.[lexer_name])=[lexer_name]
632 -- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
633 -- and `[file_patterns]` is a set of file extensions to use your lexer for.
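--
-- For instance, for a hypothetical lexer saved as *mylang.lua*, the two lines
-- might read:
--
--     file.patterns.mylang=*.my;*.mylang
--     lexer.$(file.patterns.mylang)=mylang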
635 -- Please note that Lua lexers ignore any styling information in *.properties*
636 -- files. Your theme file in the *lexers/themes/* directory contains styling
637 -- information.
639 -- ## Considerations
641 -- ### Performance
643 -- There might be some slight overhead when initializing a lexer, but loading a
644 -- file from disk into Scintilla is usually more expensive. On modern computer
645 -- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
646 -- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
647 -- so that the most common rules match first. Do keep in mind that order matters
648 -- for similar rules.
650 -- ### Limitations
652 -- Embedded preprocessor languages like PHP cannot completely embed in their
653 -- parent languages in that the parent's tokens do not support start and end
654 -- rules. This mostly goes unnoticed, but code like
656 -- <div id="<?php echo $id; ?>">
658 -- or
660 -- <div <?php if ($odd) { echo 'class="odd"'; } ?>>
662 -- will not style correctly.
664 -- ### Troubleshooting
666 -- Errors in lexers can be tricky to debug. Lexers print Lua errors to
667 -- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
668 -- from a terminal is the easiest way to see errors as they occur.
670 -- ### Risks
672 -- Poorly written lexers have the ability to crash Scintilla (and thus its
673 -- containing application), so unsaved data might be lost. However, I have only
674 -- observed these crashes in early lexer development, when syntax errors or
675 -- pattern errors are present. Once the lexer actually starts styling text
676 -- (either correctly or incorrectly, it does not matter), I have not observed
677 -- any crashes.
679 -- ### Acknowledgements
681 -- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
682 -- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
684 -- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
685 -- @field LEXERPATH (string)
686 -- The path used to search for a lexer to load.
687 -- Identical in format to Lua's `package.path` string.
688 -- The default value is `package.path`.
689 -- @field DEFAULT (string)
690 -- The token name for default tokens.
691 -- @field WHITESPACE (string)
692 -- The token name for whitespace tokens.
693 -- @field COMMENT (string)
694 -- The token name for comment tokens.
695 -- @field STRING (string)
696 -- The token name for string tokens.
697 -- @field NUMBER (string)
698 -- The token name for number tokens.
699 -- @field KEYWORD (string)
700 -- The token name for keyword tokens.
701 -- @field IDENTIFIER (string)
702 -- The token name for identifier tokens.
703 -- @field OPERATOR (string)
704 -- The token name for operator tokens.
705 -- @field ERROR (string)
706 -- The token name for error tokens.
707 -- @field PREPROCESSOR (string)
708 -- The token name for preprocessor tokens.
709 -- @field CONSTANT (string)
710 -- The token name for constant tokens.
711 -- @field VARIABLE (string)
712 -- The token name for variable tokens.
713 -- @field FUNCTION (string)
714 -- The token name for function tokens.
715 -- @field CLASS (string)
716 -- The token name for class tokens.
717 -- @field TYPE (string)
718 -- The token name for type tokens.
719 -- @field LABEL (string)
720 -- The token name for label tokens.
721 -- @field REGEX (string)
722 -- The token name for regex tokens.
723 -- @field STYLE_CLASS (string)
724 -- The style typically used for class definitions.
725 -- @field STYLE_COMMENT (string)
726 -- The style typically used for code comments.
727 -- @field STYLE_CONSTANT (string)
728 -- The style typically used for constants.
729 -- @field STYLE_ERROR (string)
730 -- The style typically used for erroneous syntax.
731 -- @field STYLE_FUNCTION (string)
732 -- The style typically used for function definitions.
733 -- @field STYLE_KEYWORD (string)
734 -- The style typically used for language keywords.
735 -- @field STYLE_LABEL (string)
736 -- The style typically used for labels.
737 -- @field STYLE_NUMBER (string)
738 -- The style typically used for numbers.
739 -- @field STYLE_OPERATOR (string)
740 -- The style typically used for operators.
741 -- @field STYLE_REGEX (string)
742 -- The style typically used for regular expression strings.
743 -- @field STYLE_STRING (string)
744 -- The style typically used for strings.
745 -- @field STYLE_PREPROCESSOR (string)
746 -- The style typically used for preprocessor statements.
747 -- @field STYLE_TYPE (string)
748 -- The style typically used for static types.
749 -- @field STYLE_VARIABLE (string)
750 -- The style typically used for variables.
751 -- @field STYLE_WHITESPACE (string)
752 -- The style typically used for whitespace.
753 -- @field STYLE_EMBEDDED (string)
754 -- The style typically used for embedded code.
755 -- @field STYLE_IDENTIFIER (string)
756 -- The style typically used for identifier words.
757 -- @field STYLE_DEFAULT (string)
758 -- The style all styles are based off of.
759 -- @field STYLE_LINENUMBER (string)
760 -- The style used for all margins except fold margins.
761 -- @field STYLE_BRACELIGHT (string)
762 -- The style used for highlighted brace characters.
763 -- @field STYLE_BRACEBAD (string)
764 -- The style used for unmatched brace characters.
765 -- @field STYLE_CONTROLCHAR (string)
766 -- The style used for control characters.
767 -- Color attributes are ignored.
768 -- @field STYLE_INDENTGUIDE (string)
769 -- The style used for indentation guides.
770 -- @field STYLE_CALLTIP (string)
771 -- The style used by call tips if [`buffer.call_tip_use_style`]() is set.
772 -- Only the font name, size, and color attributes are used.
773 -- @field any (pattern)
774 -- A pattern that matches any single character.
775 -- @field ascii (pattern)
776 -- A pattern that matches any ASCII character (codes 0 to 127).
777 -- @field extend (pattern)
778 -- A pattern that matches any ASCII extended character (codes 0 to 255).
779 -- @field alpha (pattern)
780 -- A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
781 -- @field digit (pattern)
782 -- A pattern that matches any digit ('0'-'9').
783 -- @field alnum (pattern)
784 -- A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
785 -- '0'-'9').
786 -- @field lower (pattern)
787 -- A pattern that matches any lower case character ('a'-'z').
788 -- @field upper (pattern)
789 -- A pattern that matches any upper case character ('A'-'Z').
790 -- @field xdigit (pattern)
791 -- A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
792 -- @field cntrl (pattern)
793 -- A pattern that matches any control character (ASCII codes 0 to 31).
794 -- @field graph (pattern)
795 -- A pattern that matches any graphical character ('!' to '~').
796 -- @field print (pattern)
797 -- A pattern that matches any printable character (' ' to '~').
798 -- @field punct (pattern)
799 -- A pattern that matches any punctuation character ('!' to '/', ':' to '@',
800 -- '[' to ''', '{' to '~').
801 -- @field space (pattern)
802 -- A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
803 -- '\r', space).
804 -- @field newline (pattern)
805 -- A pattern that matches any set of end of line characters.
806 -- @field nonnewline (pattern)
807 -- A pattern that matches any single, non-newline character.
808 -- @field nonnewline_esc (pattern)
809 -- A pattern that matches any single, non-newline character or any set of end
810 -- of line characters escaped with '\'.
811 -- @field dec_num (pattern)
812 -- A pattern that matches a decimal number.
813 -- @field hex_num (pattern)
814 -- A pattern that matches a hexadecimal number.
815 -- @field oct_num (pattern)
816 -- A pattern that matches an octal number.
817 -- @field integer (pattern)
818 -- A pattern that matches either a decimal, hexadecimal, or octal number.
819 -- @field float (pattern)
820 -- A pattern that matches a floating point number.
821 -- @field word (pattern)
822 -- A pattern that matches a typical word. Words begin with a letter or
823 -- underscore and consist of alphanumeric and underscore characters.
824 -- @field FOLD_BASE (number)
825 -- The initial (root) fold level.
826 -- @field FOLD_BLANK (number)
827 -- Flag indicating that the line is blank.
828 -- @field FOLD_HEADER (number)
829 -- Flag indicating the line is a fold point.
830 -- @field fold_level (table, Read-only)
831 -- Table of fold level bit-masks for line numbers starting from zero.
832 -- Fold level masks are composed of an integer level combined with any of the
833 -- following bits:
835 -- * `lexer.FOLD_BASE`
836 -- The initial fold level.
837 -- * `lexer.FOLD_BLANK`
838 -- The line is blank.
839 -- * `lexer.FOLD_HEADER`
840 -- The line is a header, or fold point.
841 -- @field indent_amount (table, Read-only)
842 -- Table of indentation amounts in character columns, for line numbers
843 -- starting from zero.
844 -- @field property (table)
845 -- Map of key-value string pairs.
846 -- @field property_expanded (table, Read-only)
847 -- Map of key-value string pairs with `$()` and `%()` variable replacement
848 -- performed in values.
849 -- @field property_int (table, Read-only)
850 -- Map of key-value pairs with values interpreted as numbers, or `0` if not
851 -- found.
852 -- @field style_at (table, Read-only)
853 -- Table of style names at positions in the buffer starting from zero.
854 module('lexer')]=]
856 local lpeg = require('lpeg')
857 local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
858 local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
859 local lpeg_Cmt, lpeg_C = lpeg.Cmt, lpeg.C
860 local lpeg_match = lpeg.match
862 M.LEXERPATH = package.path
864 -- Table of loaded lexers.
865 local lexers = {}
867 -- Keep track of the last parent lexer loaded. This lexer's rules are used for
868 -- proxy lexers (those that load parent and child lexers to embed) that do not
869 -- declare a parent lexer.
870 local parent_lexer
872 if not package.searchpath then
873 -- Searches for the given *name* in the given *path*.
874 -- This is an implementation of Lua 5.2's `package.searchpath()` function for
875 -- Lua 5.1.
876 function package.searchpath(name, path)
877 local tried = {}
878 for part in path:gmatch('[^;]+') do
879 local filename = part:gsub('%?', name)
880 local f = io.open(filename, 'r')
881 if f then f:close() return filename end
882 tried[#tried + 1] = ("no file '%s'"):format(filename)
883 end
884 return nil, table.concat(tried, '\n')
885 end
886 end
888 -- Adds a rule to a lexer's current ordered list of rules.
889 -- @param lexer The lexer to add the given rule to.
890 -- @param id The name associated with this rule. It is used for other lexers
891 -- to access this particular rule from the lexer's `_RULES` table. It does not
892 -- have to be the same as the name passed to `token`.
893 -- @param rule The LPeg pattern of the rule.
894 local function add_rule(lexer, id, rule)
895 if not lexer._RULES then
896 lexer._RULES = {}
897 -- Contains an ordered list (by numerical index) of rule names. This is used
898 -- in conjunction with lexer._RULES for building _TOKENRULE.
899 lexer._RULEORDER = {}
900 end
901 lexer._RULES[id] = rule
902 lexer._RULEORDER[#lexer._RULEORDER + 1] = id
903 end
905 -- Adds a new Scintilla style to Scintilla.
906 -- @param lexer The lexer to add the given style to.
907 -- @param token_name The name of the token associated with this style.
908 -- @param style A Scintilla style created from `style()`.
909 -- @see style
910 local function add_style(lexer, token_name, style)
911 local num_styles = lexer._numstyles
912 if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
913 if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
914 lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
915 lexer._EXTRASTYLES[token_name] = style
916 end
918 -- (Re)constructs `lexer._TOKENRULE`.
919 -- @param lexer The parent lexer.
920 local function join_tokens(lexer)
921 local patterns, order = lexer._RULES, lexer._RULEORDER
922 local token_rule = patterns[order[1]]
923 for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
924 lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
925 return lexer._TOKENRULE
926 end
928 -- Adds a given lexer and any of its embedded lexers to a given grammar.
929 -- @param grammar The grammar to add the lexer to.
930 -- @param lexer The lexer to add.
931 local function add_lexer(grammar, lexer, token_rule)
932 local token_rule = join_tokens(lexer)
933 local lexer_name = lexer._NAME
934 for _, child in ipairs(lexer._CHILDREN) do
935 if child._CHILDREN then add_lexer(grammar, child) end
936 local child_name = child._NAME
937 local rules = child._EMBEDDEDRULES[lexer_name]
938 local rules_token_rule = grammar['__'..child_name] or rules.token_rule
939 grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
940 rules.end_rule^-1 * lpeg_V(lexer_name)
941 local embedded_child = '_'..child_name
942 grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
943 rules_token_rule)^0 * rules.end_rule^-1
944 token_rule = lpeg_V(embedded_child) + token_rule
945 end
946 grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
947 grammar[lexer_name] = token_rule^0
948 end
950 -- (Re)constructs `lexer._GRAMMAR`.
951 -- @param lexer The parent lexer.
952 -- @param initial_rule The name of the rule to start lexing with. The default
953 -- value is `lexer._NAME`. Multilang lexers use this to start with a child
954 -- rule if necessary.
955 local function build_grammar(lexer, initial_rule)
956 local children = lexer._CHILDREN
957 if children then
958 local lexer_name = lexer._NAME
959 if not initial_rule then initial_rule = lexer_name end
960 local grammar = {initial_rule}
961 add_lexer(grammar, lexer)
962 lexer._INITIALRULE = initial_rule
963 lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
964 else
965 lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
966 end
967 end
969 local string_upper = string.upper
970 -- Default styles.
971 local default = {
972 'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
973 'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
974 'function', 'class', 'type', 'label', 'regex', 'embedded'
975 }
976 for _, v in ipairs(default) do
977 M[string_upper(v)] = v
978 if not M['STYLE_'..string_upper(v)] then
979 M['STYLE_'..string_upper(v)] = ''
980 end
981 end
982 -- Predefined styles.
983 local predefined = {
984 'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
985 'indentguide', 'calltip'
986 }
987 for _, v in ipairs(predefined) do
988 M[string_upper(v)] = v
989 if not M['STYLE_'..string_upper(v)] then
990 M['STYLE_'..string_upper(v)] = ''
991 end
992 end
995 -- Initializes or loads and returns the lexer of string name *name*.
996 -- Scintilla calls this function to load a lexer. Parent lexers also call this
997 -- function to load child lexers and vice-versa. The user calls this function
998 -- to load a lexer when using Scintillua as a Lua library.
999 -- @param name The name of the lexing language.
1000 -- @param alt_name The alternate name of the lexing language. This is useful for
1001 -- embedding the same child lexer with multiple sets of start and end tokens.
1002 -- @return lexer object
1003 -- @name load
1004 function M.load(name, alt_name)
1005 if lexers[alt_name or name] then return lexers[alt_name or name] end
1006 parent_lexer = nil -- reset
1008 -- When using Scintillua as a stand-alone module, the `property` and
1009 -- `property_int` tables do not exist (they are not useful). Create them to
1010 -- prevent errors from occurring.
1011 if not M.property then
1012 M.property, M.property_int = {}, setmetatable({}, {
1013 __index = function(t, k)
1014 return tostring(tonumber(M.property[k]) or 0)
1015 end,
1016 __newindex = function() error('read-only property') end
1017 })
1018 end
1020 -- Load the language lexer with its rules, styles, etc.
1021 M.WHITESPACE = (alt_name or name)..'_whitespace'
1022 local lexer_file, error = package.searchpath(name, M.LEXERPATH)
1023 local ok, lexer = pcall(dofile, lexer_file or '')
1024 if not ok then
1025 _G.print(error or lexer) -- error message
1026 lexer = {_NAME = alt_name or name}
1027 end
1028 if alt_name then lexer._NAME = alt_name end
1030 -- Create the initial maps for token names to style numbers and styles.
1031 local token_styles = {}
1032 for i = 1, #default do token_styles[default[i]] = i - 1 end
1033 for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
1034 lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
1035 lexer._EXTRASTYLES = {}
1037 -- If the lexer is a proxy (loads parent and child lexers to embed) and does
1038 -- not declare a parent, try and find one and use its rules.
1039 if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end
1041 -- If the lexer is a proxy or a child that embedded itself, add its rules and
1042 -- styles to the parent lexer. Then set the parent to be the main lexer.
1043 if lexer._lexer then
1044 local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
1045 if not l._tokenstyles then l._tokenstyles = {} end
1046 for _, r in ipairs(_r or {}) do
1047 -- Prevent rule id clashes.
1048 l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
1049 end
1050 for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
1051 lexer = l
1052 end
1054 -- Add the lexer's styles and build its grammar.
1055 if lexer._rules then
1056 for token, style in pairs(lexer._tokenstyles or {}) do
1057 add_style(lexer, token, style)
1058 end
1059 for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
1060 build_grammar(lexer)
1061 end
1062 -- Add the lexer's unique whitespace style.
1063 add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)
1065 -- Process the lexer's fold symbols.
1066 if lexer._foldsymbols and lexer._foldsymbols._patterns then
1067 local patterns = lexer._foldsymbols._patterns
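-- Wrapping each pattern in '()(' and ')' adds a position capture, so that
-- `M.fold()` can later recover both where in a line a fold symbol matched and
-- the matched text itself.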
1068 for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
1069 end
1071 lexer.lex, lexer.fold = M.lex, M.fold
1072 lexers[alt_name or name] = lexer
1073 return lexer
1074 end
1077 -- Lexes a chunk of text *text* (that has an initial style number of
1078 -- *init_style*) with lexer *lexer*.
1079 -- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
1080 -- Otherwise the text is lexed as a whole.
1081 -- @param lexer The lexer object to lex with.
1082 -- @param text The text in the buffer to lex.
1083 -- @param init_style The current style. Multiple-language lexers use this to
1084 -- determine which language to start lexing in.
1085 -- @return table of token names and positions.
1086 -- @name lex
1087 function M.lex(lexer, text, init_style)
1088 if not lexer._LEXBYLINE then
1089 -- For multilang lexers, build a new grammar whose initial_rule is the
1090 -- current language.
1091 if lexer._CHILDREN then
1092 for style, style_num in pairs(lexer._TOKENSTYLES) do
1093 if style_num == init_style then
1094 local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
1095 if lexer._INITIALRULE ~= lexer_name then
1096 build_grammar(lexer, lexer_name)
1097 end
1098 break
1099 end
1100 end
1101 end
1102 return lpeg_match(lexer._GRAMMAR, text)
1103 else
1104 local tokens = {}
1105 local function append(tokens, line_tokens, offset)
1106 for i = 1, #line_tokens, 2 do
1107 tokens[#tokens + 1] = line_tokens[i]
1108 tokens[#tokens + 1] = line_tokens[i + 1] + offset
1109 end
1110 end
1111 local offset = 0
1112 local grammar = lexer._GRAMMAR
1113 for line in text:gmatch('[^\r\n]*\r?\n?') do
1114 local line_tokens = lpeg_match(grammar, line)
1115 if line_tokens then append(tokens, line_tokens, offset) end
1116 offset = offset + #line
1117 -- Use the default style to the end of the line if none was specified.
1118 if tokens[#tokens] ~= offset then
1119 tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
1120 end
1121 end
1122 return tokens
1123 end
1124 end
1127 -- Folds a chunk of text *text* with lexer *lexer*.
1128 -- Folds *text* starting at position *start_pos* on line number *start_line*
1129 -- with a beginning fold level of *start_level* in the buffer. If *lexer* has a
1130 -- `_fold` function or a `_foldsymbols` table, that field is used to perform
1131 -- folding. Otherwise, if *lexer* has a `_FOLDBYINDENTATION` field set, or if a
1132 -- `fold.by.indentation` property is set, folding by indentation is done.
1133 -- @param lexer The lexer object to fold with.
1134 -- @param text The text in the buffer to fold.
1135 -- @param start_pos The position in the buffer *text* starts at.
1136 -- @param start_line The line number *text* starts on.
1137 -- @param start_level The fold level *text* starts on.
1138 -- @return table of fold levels.
1139 -- @name fold
1140 function M.fold(lexer, text, start_pos, start_line, start_level)
1141 local folds = {}
1142 if text == '' then return folds end
1143 local fold = M.property_int['fold'] > 0
1144 local FOLD_BASE = M.FOLD_BASE
1145 local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
1146 if fold and lexer._fold then
1147 return lexer._fold(text, start_pos, start_line, start_level)
1148 elseif fold and lexer._foldsymbols then
1149 local lines = {}
1150 for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
1151 lines[#lines + 1] = {p, l}
1152 end
1153 local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
1154 local fold_symbols = lexer._foldsymbols
1155 local fold_symbols_patterns = fold_symbols._patterns
1156 local style_at, fold_level = M.style_at, M.fold_level
1157 local line_num, prev_level = start_line, start_level
1158 local current_level = prev_level
1159 for i = 1, #lines do
1160 local pos, line = lines[i][1], lines[i][2]
1161 if line ~= '' then
1162 local level_decreased = false
1163 for j = 1, #fold_symbols_patterns do
1164 for s, match in line:gmatch(fold_symbols_patterns[j]) do
1165 local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
1166 local l = symbols and symbols[match]
1167 if type(l) == 'function' then l = l(text, pos, line, s, match) end
1168 if type(l) == 'number' then
1169 current_level = current_level + l
1170 if l < 0 and current_level < prev_level then
1171 -- Potential zero-sum line. If the level were to go back up on
1172 -- the same line, the line may be marked as a fold header.
1173 level_decreased = true
1174 end
1175 end
1176 end
1177 end
1178 folds[line_num] = prev_level
1179 if current_level > prev_level then
1180 folds[line_num] = prev_level + FOLD_HEADER
1181 elseif level_decreased and current_level == prev_level and
1182 fold_zero_sum_lines then
1183 if line_num > start_line then
1184 folds[line_num] = prev_level - 1 + FOLD_HEADER
1185 else
1186 -- Typing within a zero-sum line.
1187 local level = fold_level[line_num - 1] - 1
1188 if level > FOLD_HEADER then level = level - FOLD_HEADER end
1189 if level > FOLD_BLANK then level = level - FOLD_BLANK end
1190 folds[line_num] = level + FOLD_HEADER
1191 current_level = current_level + 1
1192 end
1193 end
1194 if current_level < FOLD_BASE then current_level = FOLD_BASE end
1195 prev_level = current_level
1196 else
1197 folds[line_num] = prev_level + FOLD_BLANK
1198 end
1199 line_num = line_num + 1
1200 end
1201 elseif fold and (lexer._FOLDBYINDENTATION or
1202 M.property_int['fold.by.indentation'] > 0) then
1203 -- Indentation based folding.
1204 -- Calculate indentation per line.
1205 local indentation = {}
1206 for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
1207 indentation[#indentation + 1] = line ~= '' and #indent
1208 end
1209 -- Find the first non-blank line before start_line. If the current line is
1210 -- indented, make that previous line a header and update the levels of any
1211 -- blank lines in between. If the current line is blank, match the level of
1212 -- the previous non-blank line.
1213 local current_level = start_level
1214 for i = start_line - 1, 0, -1 do
1215 local level = M.fold_level[i]
1216 if level >= FOLD_HEADER then level = level - FOLD_HEADER end
1217 if level < FOLD_BLANK then
1218 local indent = M.indent_amount[i]
1219 if indentation[1] and indentation[1] > indent then
1220 folds[i] = FOLD_BASE + indent + FOLD_HEADER
1221 for j = i + 1, start_line - 1 do
1222 folds[j] = start_level + FOLD_BLANK
1223 end
1224 elseif not indentation[1] then
1225 current_level = FOLD_BASE + indent
1226 end
1227 break
1228 end
1229 end
1230 -- Iterate over lines, setting fold numbers and fold flags.
1231 for i = 1, #indentation do
1232 if indentation[i] then
1233 current_level = FOLD_BASE + indentation[i]
1234 folds[start_line + i - 1] = current_level
1235 for j = i + 1, #indentation do
1236 if indentation[j] then
1237 if FOLD_BASE + indentation[j] > current_level then
1238 folds[start_line + i - 1] = current_level + FOLD_HEADER
1239 current_level = FOLD_BASE + indentation[j] -- for any blanks below
1240 end
1241 break
1242 end
1243 end
1244 else
1245 folds[start_line + i - 1] = current_level + FOLD_BLANK
1246 end
1247 end
1248 else
1249 -- No folding, reset fold levels if necessary.
1250 local current_line = start_line
1251 for _ in text:gmatch('\r?\n') do
1252 folds[current_line] = start_level
1253 current_line = current_line + 1
1254 end
1255 end
1256 return folds
1257 end
1259 -- The following are utility functions lexers will have access to.
1261 -- Common patterns.
1262 M.any = lpeg_P(1)
1263 M.ascii = lpeg_R('\000\127')
1264 M.extend = lpeg_R('\000\255')
1265 M.alpha = lpeg_R('AZ', 'az')
1266 M.digit = lpeg_R('09')
1267 M.alnum = lpeg_R('AZ', 'az', '09')
1268 M.lower = lpeg_R('az')
1269 M.upper = lpeg_R('AZ')
1270 M.xdigit = lpeg_R('09', 'AF', 'af')
1271 M.cntrl = lpeg_R('\000\031')
1272 M.graph = lpeg_R('!~')
1273 M.print = lpeg_R(' ~')
1274 M.punct = lpeg_R('!/', ':@', '[\'', '{~')
1275 M.space = lpeg_S('\t\v\f\n\r ')
1277 M.newline = lpeg_S('\r\n\f')^1
1278 M.nonnewline = 1 - M.newline
1279 M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any
1281 M.dec_num = M.digit^1
1282 M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
1283 M.oct_num = '0' * lpeg_R('07')^1
1284 M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
1285 M.float = lpeg_S('+-')^-1 *
1286 (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 +
1287 M.digit^1) *
1288 lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1
1289 M.word = (M.alpha + '_') * (M.alnum + '_')^0
1292 -- Creates and returns a token pattern with token name *name* and pattern
1293 -- *patt*.
1294 -- If *name* is not a predefined token name, its style must be defined in the
1295 -- lexer's `_tokenstyles` table.
1296 -- @param name The name of the token. If this name is not a predefined token name,
1297 -- then a style needs to be associated with it in the lexer's `_tokenstyles`
1298 -- table.
1299 -- @param patt The LPeg pattern associated with the token.
1300 -- @return pattern
1301 -- @usage local ws = token(l.WHITESPACE, l.space^1)
1302 -- @usage local annotation = token('annotation', '@' * l.word)
1303 -- @name token
1304 function M.token(name, patt)
1305 return lpeg_Cc(name) * patt * lpeg_Cp()
1306 end
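-- Example: a token pattern captures the token name followed by the position
-- just past the matched text, which is what the lexing loop consumes
-- (a minimal sketch):
--
--   local l = require('lexer')
--   local ws = l.token(l.WHITESPACE, l.space^1)
--   lpeg.match(ws, '  foo') --> 'whitespace', 3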
1309 -- Creates and returns a pattern that matches a range of text bounded by
1310 -- *chars* characters.
1311 -- This is a convenience function for matching more complicated delimited ranges
1312 -- like strings with escape characters and balanced parentheses. *single_line*
1313 -- indicates whether or not the range must be on a single line, *no_escape*
1314 -- indicates whether or not to ignore '\' as an escape character, and *balanced*
1315 -- indicates whether or not to handle balanced ranges like parentheses and
1316 -- requires *chars* to be composed of two characters.
1317 -- @param chars The character(s) that bound the matched range.
1318 -- @param single_line Optional flag indicating whether or not the range must be
1319 -- on a single line.
1320 -- @param no_escape Optional flag indicating whether or not to ignore '\\' as
1321 -- an escape character; when `true`, the range end cannot be escaped.
1322 -- @param balanced Optional flag indicating whether or not to match a balanced
1323 -- range, like the "%b" Lua pattern. This flag only applies if *chars*
1324 -- consists of two different characters (e.g. "()").
1325 -- @return pattern
1326 -- @usage local dq_str_escapes = l.delimited_range('"')
1327 -- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
1328 -- @usage local unbalanced_parens = l.delimited_range('()')
1329 -- @usage local balanced_parens = l.delimited_range('()', false, false, true)
1330 -- @see nested_pair
1331 -- @name delimited_range
1332 function M.delimited_range(chars, single_line, no_escape, balanced)
1333 local s = chars:sub(1, 1)
1334 local e = #chars == 2 and chars:sub(2, 2) or s
1335 local range
1336 local b = balanced and s or ''
1337 local n = single_line and '\n' or ''
1338 if no_escape then
1339 local invalid = lpeg_S(e..n..b)
1340 range = M.any - invalid
1341 else
1342 local invalid = lpeg_S(e..n..b) + '\\'
1343 range = M.any - invalid + '\\' * M.any
1344 end
1345 if balanced and s ~= e then
1346 return lpeg_P{s * (range + lpeg_V(1))^0 * e}
1347 else
1348 return s * range^0 * lpeg_P(e)^-1
1349 end
1350 end
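-- Example: single-line, escapable single-quoted strings as found in many
-- C-like languages (a minimal sketch):
--
--   local l = require('lexer')
--   local sq_str = l.delimited_range("'", true)
--   local string_tok = l.token(l.STRING, sq_str)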
1353 -- Creates and returns a pattern that matches pattern *patt* only at the
1354 -- beginning of a line.
1355 -- @param patt The LPeg pattern to match on the beginning of a line.
1356 -- @return pattern
1357 -- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
1358 -- l.nonnewline^0)
1359 -- @name starts_line
1360 function M.starts_line(patt)
1361 return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
1362 local pos = index - #match
1363 if pos == 1 then return index, ... end
1364 local char = input:sub(pos - 1, pos - 1)
1365 if char == '\n' or char == '\r' or char == '\f' then return index, ... end
1366 end)
1367 end
1370 -- Creates and returns a pattern that verifies the first non-whitespace
1371 -- character preceding the current match position is in string set *s*.
1372 -- @param s String character set like one passed to `lpeg.S()`.
1373 -- @return pattern
1374 -- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
1375 -- l.delimited_range('/')
1376 -- @name last_char_includes
1377 function M.last_char_includes(s)
1378 s = '['..s:gsub('[-%%%[]', '%%%1')..']'
1379 return lpeg_P(function(input, index)
1380 if index == 1 then return index end
1381 local i = index
1382 while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
1383 if input:sub(i - 1, i - 1):match(s) then return index end
1384 end)
1385 end
1388 -- Returns a pattern that matches a balanced range of text that starts with
1389 -- string *start_chars* and ends with string *end_chars*.
1390 -- With single-character delimiters, this function is identical to
1391 -- `delimited_range(start_chars..end_chars, false, true, true)`.
1392 -- @param start_chars The string starting a nested sequence.
1393 -- @param end_chars The string ending a nested sequence.
1394 -- @return pattern
1395 -- @usage local nested_comment = l.nested_pair('/*', '*/')
1396 -- @see delimited_range
1397 -- @name nested_pair
1398 function M.nested_pair(start_chars, end_chars)
1399 local s, e = start_chars, lpeg_P(end_chars)^-1
1400 return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
1401 end
1404 -- Creates and returns a pattern that matches any single word in list *words*.
1405 -- Words consist of alphanumeric and underscore characters, as well as the
1406 -- characters in string set *word_chars*. *case_insensitive* indicates whether
1407 -- or not to ignore case when matching words.
1408 -- This is a convenience function for simplifying a set of ordered choice word
1409 -- patterns.
1410 -- @param words A table of words.
1411 -- @param word_chars Optional string of additional characters considered to be
1412 -- part of a word. By default, word characters are alphanumerics and
1413 -- underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
1414 -- to indicate no additional word characters.
1415 -- @param case_insensitive Optional boolean flag indicating whether or not the
1416 -- word match is case-insensitive. The default is `false`.
1417 -- @return pattern
1418 -- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
1419 -- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
1420 -- 'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
1421 -- @name word_match
1422 function M.word_match(words, word_chars, case_insensitive)
1423 local word_list = {}
1424 for _, word in ipairs(words) do
1425 word_list[case_insensitive and word:lower() or word] = true
1426 end
1427 local chars = M.alnum + '_'
1428 if word_chars then chars = chars + lpeg_S(word_chars) end
1429 return lpeg_Cmt(chars^1, function(input, index, word)
1430 if case_insensitive then word = word:lower() end
1431 return word_list[word] and index or nil
1432 end)
1433 end
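-- Example: keywords wrapped in a KEYWORD token; passing `true` as the third
-- argument gives SQL-style case-insensitive matching (a minimal sketch):
--
--   local l = require('lexer')
--   local keyword = l.token(l.KEYWORD, l.word_match({
--     'select', 'from', 'where'
--   }, nil, true))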
1436 -- Embeds child lexer *child* in parent lexer *parent* using patterns
1437 -- *start_rule* and *end_rule*, which signal the beginning and end of the
1438 -- embedded lexer, respectively.
1439 -- @param parent The parent lexer.
1440 -- @param child The child lexer.
1441 -- @param start_rule The pattern that signals the beginning of the embedded
1442 -- lexer.
1443 -- @param end_rule The pattern that signals the end of the embedded lexer.
1444 -- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
1445 -- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
1446 -- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
1447 -- @name embed_lexer
1448 function M.embed_lexer(parent, child, start_rule, end_rule)
1449 -- Add child rules.
1450 if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
1451 if not child._RULES then -- creating a child lexer to be embedded
1452 if not child._rules then error('Cannot embed language with no rules') end
1453 for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
1454 end
1455 child._EMBEDDEDRULES[parent._NAME] = {
1456 ['start_rule'] = start_rule,
1457 token_rule = join_tokens(child),
1458 ['end_rule'] = end_rule
1459 }
1460 if not parent._CHILDREN then parent._CHILDREN = {} end
1461 local children = parent._CHILDREN
1462 children[#children + 1] = child
1463 -- Add child styles.
1464 if not parent._tokenstyles then parent._tokenstyles = {} end
1465 local tokenstyles = parent._tokenstyles
1466 tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
1467 for token, style in pairs(child._tokenstyles or {}) do
1468 tokenstyles[token] = style
1469 end
1470 child._lexer = parent -- use parent's tokens if child is embedding itself
1471 parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
1472 end
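-- Example: a simplified sketch of the embedding idiom in the @usage lines
-- above, assuming `M` is the parent (HTML-like) lexer under construction and
-- `l.load` is this module's lexer loader; the 'style_tag' token name and its
-- style are illustrative only:
--
--   local l = require('lexer')
--   local css = l.load('css')
--   local css_start_rule = l.token('style_tag', '<style>')
--   local css_end_rule = l.token('style_tag', '</style>')
--   l.embed_lexer(M, css, css_start_rule, css_end_rule)
--   M._tokenstyles = {style_tag = l.STYLE_TAG}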
1474 -- Determines if the previous line is a comment.
1475 -- This is used for determining if the current comment line is a fold point.
1476 -- @param prefix The prefix string defining a comment.
1477 -- @param text The text passed to a fold function.
1478 -- @param pos The pos passed to a fold function.
1479 -- @param line The line passed to a fold function.
1480 -- @param s The s passed to a fold function.
1481 local function prev_line_is_comment(prefix, text, pos, line, s)
1482 local start = line:find('%S')
1483 if start < s and not line:find(prefix, start, true) then return false end
1484 local p = pos - 1
1485 if text:sub(p, p) == '\n' then
1486 p = p - 1
1487 if text:sub(p, p) == '\r' then p = p - 1 end
1488 if text:sub(p, p) ~= '\n' then
1489 while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
1490 while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1491 return text:sub(p, p + #prefix - 1) == prefix
1492 end
1493 end
1494 return false
1495 end
1497 -- Determines if the next line is a comment.
1498 -- This is used for determining if the current comment line is a fold point.
1499 -- @param prefix The prefix string defining a comment.
1500 -- @param text The text passed to a fold function.
1501 -- @param pos The pos passed to a fold function.
1502 -- @param line The line passed to a fold function.
1503 -- @param s The s passed to a fold function.
1504 local function next_line_is_comment(prefix, text, pos, line, s)
1505 local p = text:find('\n', pos + s)
1506 if p then
1507 p = p + 1
1508 while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1509 return text:sub(p, p + #prefix - 1) == prefix
1510 end
1511 return false
1512 end
1515 -- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
1516 -- that folds consecutive line comments that start with string *prefix*.
1517 -- @param prefix The prefix string defining a line comment.
1518 -- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
1519 -- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
1520 -- @name fold_line_comments
1521 function M.fold_line_comments(prefix)
1522 local property_int = M.property_int
1523 return function(text, pos, line, s)
1524 if property_int['fold.line.comments'] == 0 then return 0 end
1525 if s > 1 and line:match('^%s*()') < s then return 0 end
1526 local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
1527 local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
1528 if not prev_line_comment and next_line_comment then return 1 end
1529 if prev_line_comment and not next_line_comment then return -1 end
1530 return 0
1531 end
1532 end
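-- Example: a lexer's `_foldsymbols` table folding both braces and runs of
-- line comments with the function above (a minimal sketch):
--
--   M._foldsymbols = {
--     _patterns = {'[{}]', '%-%-'},
--     [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
--     [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
--   }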
1534 M.property_expanded = setmetatable({}, {
1535 -- Returns the string property value associated with string property *key*,
1536 -- replacing any "$()" and "%()" expressions with the values of their keys.
1537 __index = function(t, key)
1538 return M.property[key]:gsub('[$%%]%b()', function(key)
1539 return t[key:sub(3, -2)]
1540 end)
1541 end,
1542 __newindex = function() error('read-only property') end
1543 })
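-- Example: expansion is recursive. With hypothetical property keys
-- (a minimal sketch):
--
--   property['foo']     = 'baz'
--   property['foo.bar'] = '$(foo),bar'
--   property_expanded['foo.bar'] --> 'baz,bar'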
1545 --[[ The functions and fields below were defined in C.
1548 -- Individual fields for a lexer instance.
1549 -- @field _NAME The string name of the lexer.
1550 -- @field _rules An ordered list of rules for a lexer grammar.
1551 -- Each rule is a table containing an arbitrary rule name and the LPeg pattern
1552 -- associated with the rule. The order of rules is important as rules are
1553 -- matched sequentially.
1554 -- Child lexers should not use this table to access and/or modify their
1555 -- parent's rules and vice-versa. Use the `_RULES` table instead.
1556 -- @field _tokenstyles A map of non-predefined token names to styles.
1557 -- Remember to use token names, not rule names. It is recommended to use
1558 -- predefined styles or color-agnostic styles derived from predefined styles
1559 -- to ensure compatibility with user color themes.
1560 -- @field _foldsymbols A table of recognized fold points for the lexer.
1561 -- Keys are token names with table values defining fold points. Those table
1562 -- values have string keys of keywords or characters that indicate a fold
1563 -- point whose values are integers. A value of `1` indicates a beginning fold
1564 -- point and a value of `-1` indicates an ending fold point. Values can also
1565 -- be functions that return `1`, `-1`, or `0` (indicating no fold point) for
1566 -- keys which need additional processing.
1567 -- There is also a required `_patterns` key whose value is a table containing
1568 -- Lua pattern strings that match all fold points (the string keys contained
1569 -- in token name table values). When the lexer encounters text that matches
1570 -- one of those patterns, the matched text is looked up in its token's table
1571 -- to determine whether or not it is a fold point.
1572 -- @field _fold If this function exists in the lexer, it is called for folding
1573 -- the document instead of using `_foldsymbols` or indentation.
1574 -- @field _lexer The parent lexer object whose rules should be used. This field
1575 -- is only necessary to disambiguate a proxy lexer that loaded parent and
1576 -- child lexers for embedding and ended up having multiple parents loaded.
1577 -- @field _RULES A map of rule name keys with their associated LPeg pattern
1578 -- values for the lexer.
1579 -- This is constructed from the lexer's `_rules` table and accessible to other
1580 -- lexers for embedded lexer applications like modifying parent or child
1581 -- rules.
1582 -- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
1583 -- (instead of an arbitrary chunk of text) at a time.
1584 -- The default value is `false`. Line lexers cannot look ahead to subsequent
1585 -- lines.
1586 -- @field _FOLDBYINDENTATION Declares the lexer does not define fold points and
1587 -- that fold points should be calculated based on changes in indentation.
1588 -- @class table
1589 -- @name lexer
1590 local lexer
1591 ]]
1593 function M.get_style(lexer, lang, token_name)
1594 return lexer['STYLE_'..string_upper(token_name)] or lang._EXTRASTYLES[token_name]
1595 end
1597 local files = {
1598 [".as|.asc"] = "actionscript",
1599 [".adb|.ads"] = "ada",
1600 [".g|.g4"] = "antlr",
1601 [".ans|.inp|.mac"] = "apdl",
1602 [".applescript"] = "applescript",
1603 [".asm|.ASM|.s|.S"] = "asm",
1604 [".asa|.asp|.hta"] = "asp",
1605 [".awk"] = "awk",
1606 [".bat|.cmd"] = "batch",
1607 [".bib"] = "bibtex",
1608 [".boo"] = "boo",
1609 [".cs"] = "csharp",
1610 [".c|.cc|.C"] = "ansi_c",
1611 [".cpp|.cxx|.c++|.h|.hh|.hpp|.hxx|.h++"] = "cpp",
1612 [".ck"] = "chuck",
1613 [".cmake|.cmake.in|.ctest|.ctest.in"] = "cmake",
1614 [".coffee"] = "coffeescript",
1615 [".css"] = "css",
1616 [".cu|.cuh"] = "cuda",
1617 [".d|.di"] = "dmd",
1618 [".dart"] = "dart",
1619 [".desktop"] = "desktop",
1620 [".diff|.patch"] = "diff",
1621 [".dot"] = "dot",
1622 [".e|.eif"] = "eiffel",
1623 [".ex|.exs"] = "elixir",
1624 [".erl|.hrl"] = "erlang",
1625 [".fs"] = "fsharp",
1626 [".fish"] = "fish",
1627 [".forth|.frt|.fs"] = "forth",
1628 [".f|.for|.ftn|.fpp|.f77|.f90|.f95|.f03|.f08"] = "fortran",
1629 [".g|.gd|.gi|.gap"] = "gap",
1630 [".po|.pot"] = "gettext",
1631 [".glslf|.glslv"] = "glsl",
1632 [".dem|.plt"] = "gnuplot",
1633 [".go"] = "go",
1634 [".groovy|.gvy"] = "groovy",
1635 [".gtkrc"] = "gtkrc",
1636 [".hs"] = "haskell",
1637 [".htm|.html|.shtm|.shtml|.xhtml"] = "html",
1638 [".idl|.odl"] = "idl",
1639 [".inf|.ni"] = "inform",
1640 [".cfg|.cnf|.inf|.ini|.reg"] = "ini",
1641 [".io"] = "io_lang",
1642 [".bsh|.java"] = "java",
1643 [".js|.jsfl"] = "javascript",
1644 [".json"] = "json",
1645 [".jsp"] = "jsp",
1646 [".bbl|.dtx|.ins|.ltx|.tex|.sty"] = "latex",
1647 [".less"] = "less",
1648 [".lily|.ly"] = "lilypond",
1649 [".cl|.el|.lisp|.lsp"] = "lisp",
1650 [".litcoffee"] = "litcoffee",
1651 [".lua"] = "lua",
1652 ["GNUmakefile|.iface|.mak|.mk|makefile|Makefile"] = "makefile",
1653 [".md"] = "markdown",
1654 [".n"] = "nemerle",
1655 [".nim"] = "nim",
1656 [".nsh|.nsi|.nsis"] = "nsis",
1657 [".m|.mm|.objc"] = "objective_c",
1658 [".caml|.ml|.mli|.mll|.mly"] = "caml",
1659 [".dpk|.dpr|.p|.pas"] = "pascal",
1660 [".al|.perl|.pl|.pm|.pod"] = "perl",
1661 [".inc|.php|.php3|.php4|.phtml"] = "php",
1662 [".pike|.pmod"] = "pike",
1663 [".PKGBUILD"] = "pkgbuild",
1664 [".ps1"] = "powershell",
1665 [".eps|.ps"] = "ps",
1666 [".prolog"] = "prolog",
1667 [".props|.properties"] = "props",
1668 [".sc|.py|.pyw"] = "python",
1669 [".R|.Rout|.Rhistory|.Rt|Rout.save|Rout.fail"] = "rstats",
1670 [".r|.reb"] = "rebol",
1671 [".rst"] = "rest",
1672 [".orx|.rex"] = "rexx",
1673 [".erb|.rhtml"] = "rhtml",
1674 [".Rakefile|.rake|.rb|.rbw"] = "ruby",
1675 [".rs"] = "rust",
1676 [".sass|.scss"] = "sass",
1677 [".scala"] = "scala",
1678 [".sch|.scm"] = "scheme",
1679 [".bash|.bashrc|.bash_profile|.configure|.csh|.sh|.zsh"] = "bash",
1680 [".changes|.st|.sources"] = "smalltalk",
1681 [".ddl|.sql"] = "sql",
1682 [".tcl|.tk"] = "tcl",
1683 [".texi"] = "texinfo",
1684 [".toml"] = "toml",
1685 [".vala"] = "vala",
1686 [".vcf|.vcard"] = "vcard",
1687 [".v|.ver"] = "verilog",
1688 [".vh|.vhd|.vhdl"] = "vhdl",
1689 [".asa|.bas|.cls|.ctl|.dob|.dsm|.dsr|.frm|.pag|.vb|.vba|.vbs"] = "vb",
1690 [".wsf"] = "wsf",
1691 [".dtd|.svg|.xml|.xsd|.xsl|.xslt|.xul"] = "xml",
1692 [".xtend"] = "xtend",
1693 [".yaml"] = "yaml",
1696 function M.lexer_name(filename)
1697 -- filename = string.lower(filename)
1698 for patterns, lang in pairs(files) do
1699 for pattern in string.gmatch(patterns, '[^|]+') do
1700 if #filename >= #pattern then
1701 local s, e = string.find(filename, pattern, -#pattern, true)
1702 if s ~= e and e == #filename then
1703 return lang
1704 end
1705 end
1706 end
1707 end
1708 return nil
1709 end
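-- Example: mapping a filename to the lexer name to load (a minimal sketch):
--
--   local l = require('lexer')
--   l.lexer_name('init.lua')    --> 'lua'
--   l.lexer_name('Makefile.am') --> nil (no suffix pattern matches)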
1711 return M