basis/xmode/marker/marker.factor

   1 ! Copyright (C) 2008 Slava Pestov.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 IN: xmode.marker
   4 USING: kernel namespaces make xmode.rules xmode.tokens
   5 xmode.marker.state xmode.marker.context xmode.utilities
   6 xmode.catalog sequences math assocs combinators strings
   7 parser-combinators.regexp splitting parser-combinators ascii
   8 ascii combinators.short-circuit accessors ;
   9
  10 ! Based on org.gjt.sp.jedit.syntax.TokenMarker
  11
  12 : current-keyword ( -- string )
  13     last-offset get position get line get subseq ;
  14
  15 : keyword-number? ( keyword -- ? )
  16     {
  17         [ current-rule-set highlight-digits?>> ]
  18         [ dup [ digit? ] contains? ]
  19         [
  20             dup [ digit? ] all? [
  21                 current-rule-set digit-re>>
  22                 dup [ dupd matches? ] [ drop f ] if
  23             ] unless*
  24         ]
  25     } 0&& nip ;
  26
  27 : mark-number ( keyword -- id )
  28     keyword-number? DIGIT and ;
  29
  30 : mark-keyword ( keyword -- id )
  31     current-rule-set keywords>> at ;
  32
  33 : add-remaining-token ( -- )
  34     current-rule-set default>> prev-token, ;
  35
  36 : mark-token ( -- )
  37     current-keyword
  38     dup mark-number [ ] [ mark-keyword ] ?if
  39     [ prev-token, ] when* ;
  40
  41 : current-char ( -- char )
  42     position get line get nth ;
  43
  44 GENERIC: match-position ( rule -- n )
  45
  46 M: mark-previous-rule match-position drop last-offset get ;
  47
  48 M: rule match-position drop position get ;
  49
  50 : can-match-here? ( matcher rule -- ? )
  51     match-position {
  52         [ over ]
  53         [ over at-line-start?>>     over zero?                implies ]
  54         [ over at-whitespace-end?>> over whitespace-end get = implies ]
  55         [ over at-word-start?>>     over last-offset get =    implies ]
  56     } 0&& 2nip ;
  57
  58 : rest-of-line ( -- str )
  59     line get position get tail-slice ;
  60
  61 GENERIC: text-matches? ( string text -- match-count/f )
  62
  63 M: f text-matches?
  64     2drop f ;
  65
  66 M: string-matcher text-matches?
  67     [
  68         [ string>> ] [ ignore-case?>> ] bi string-head?
  69     ] keep string>> length and ;
  70
  71 M: regexp text-matches?
  72     [ >string ] dip match-head ;
  73
  74 : rule-start-matches? ( rule -- match-count/f )
  75     dup start>> tuck swap can-match-here? [
  76         rest-of-line swap text>> text-matches?
  77     ] [
  78         drop f
  79     ] if ;
  80
  81 : rule-end-matches? ( rule -- match-count/f )
  82     dup mark-following-rule? [
  83         dup start>> swap can-match-here? 0 and
  84     ] [
  85         dup end>> tuck swap can-match-here? [
  86             rest-of-line
  87             swap text>> context get end>> or
  88             text-matches?
  89         ] [
  90             drop f
  91         ] if
  92     ] if ;
  93
  94 DEFER: get-rules
  95
  96 : get-always-rules ( vector/f ruleset -- vector/f )
  97     f swap rules>> at ?push-all ;
  98
  99 : get-char-rules ( vector/f char ruleset -- vector/f )
 100     [ ch>upper ] dip rules>> at ?push-all ;
 101
 102 : get-rules ( char ruleset -- seq )
 103     f -rot [ get-char-rules ] keep get-always-rules ;
 104
 105 GENERIC: handle-rule-start ( match-count rule -- )
 106
 107 GENERIC: handle-rule-end ( match-count rule -- )
 108
 109 : find-escape-rule ( -- rule )
 110     context get dup
 111     in-rule-set>> escape-rule>> [ ] [
 112         parent>> in-rule-set>>
 113         dup [ escape-rule>> ] when
 114     ] ?if ;
 115
 116 : check-escape-rule ( rule -- ? )
 117     no-escape?>> [ f ] [
 118         find-escape-rule dup [
 119             dup rule-start-matches? dup [
 120                 swap handle-rule-start
 121                 delegate-end-escaped? [ not ] change
 122                 t
 123             ] [
 124                 2drop f
 125             ] if
 126         ] when
 127     ] if ;
 128
 129 : check-every-rule ( -- ? )
 130     current-char current-rule-set get-rules
 131     [ rule-start-matches? ] map-find
 132     dup [ handle-rule-start t ] [ 2drop f ] if ;
 133
 134 : ?end-rule ( -- )
 135     current-rule [
 136         dup rule-end-matches?
 137         dup [ swap handle-rule-end ] [ 2drop ] if
 138     ] when* ;
 139
 140 : rule-match-token* ( rule -- id )
 141     dup match-token>> {
 142         { f [ dup body-token>> ] }
 143         { t [ current-rule-set default>> ] }
 144         [ ]
 145     } case nip ;
 146
 147 M: escape-rule handle-rule-start
 148     drop
 149     ?end-rule
 150     process-escape? get [
 151         escaped? [ not ] change
 152         position [ + ] change
 153     ] [ 2drop ] if ;
 154
 155 M: seq-rule handle-rule-start
 156     ?end-rule
 157     mark-token
 158     add-remaining-token
 159     tuck body-token>> next-token,
 160     delegate>> [ push-context ] when* ;
 161
 162 UNION: abstract-span-rule span-rule eol-span-rule ;
 163
 164 M: abstract-span-rule handle-rule-start
 165     ?end-rule
 166     mark-token
 167     add-remaining-token
 168     tuck rule-match-token* next-token,
 169     ! ... end subst ...
 170     dup context get (>>in-rule)
 171     delegate>> push-context ;
 172
 173 M: span-rule handle-rule-end
 174     2drop ;
 175
 176 M: mark-following-rule handle-rule-start
 177     ?end-rule
 178     mark-token add-remaining-token
 179     tuck rule-match-token* next-token,
 180     f context get (>>end)
 181     context get (>>in-rule) ;
 182
 183 M: mark-following-rule handle-rule-end
 184     nip rule-match-token* prev-token,
 185     f context get (>>in-rule) ;
 186
 187 M: mark-previous-rule handle-rule-start
 188     ?end-rule
 189     mark-token
 190     dup body-token>> prev-token,
 191     rule-match-token* next-token, ;
 192
 193 : do-escaped ( -- )
 194     escaped? get [
 195         escaped? off
 196         ! ...
 197     ] when ;
 198
 199 : check-end-delegate ( -- ? )
 200     context get parent>> [
 201         in-rule>> [
 202             dup rule-end-matches? dup [
 203                 [
 204                     swap handle-rule-end
 205                     ?end-rule
 206                     mark-token
 207                     add-remaining-token
 208                 ] keep context get parent>> in-rule>>
 209                 rule-match-token* next-token,
 210                 pop-context
 211                 seen-whitespace-end? on t
 212             ] [ drop check-escape-rule ] if
 213         ] [ f ] if*
 214     ] [ f ] if* ;
 215
 216 : handle-no-word-break ( -- )
 217     context get parent>> [
 218         in-rule>> [
 219             dup no-word-break?>> [
 220                 rule-match-token* prev-token,
 221                 pop-context
 222             ] [ drop ] if
 223         ] when*
 224     ] when* ;
 225
 226 : check-rule ( -- )
 227     ?end-rule
 228     handle-no-word-break
 229     mark-token
 230     add-remaining-token ;
 231
 232 : (check-word-break) ( -- )
 233     check-rule
 234
 235     1 current-rule-set default>> next-token, ;
 236
 237 : rule-set-empty? ( ruleset -- ? )
 238     [ rules>> ] [ keywords>> ] bi
 239     [ assoc-empty? ] bi@ and ;
 240
 241 : check-word-break ( -- ? )
 242     current-char dup blank? [
 243         drop
 244
 245         seen-whitespace-end? get [
 246             position get 1+ whitespace-end set
 247         ] unless
 248
 249         (check-word-break)
 250
 251     ] [
 252         ! Micro-optimization with incorrect semantics; we keep
 253         ! it here because jEdit mode files depend on it now...
 254         current-rule-set rule-set-empty? [
 255             drop
 256         ] [
 257             dup alpha? [
 258                 drop
 259             ] [
 260                 current-rule-set rule-set-no-word-sep* member? [
 261                     (check-word-break)
 262                 ] unless
 263             ] if
 264         ] if
 265
 266         seen-whitespace-end? on
 267     ] if
 268     escaped? off
 269     delegate-end-escaped? off t ;
 270
 271
 272 : mark-token-loop ( -- )
 273     position get line get length < [
 274         {
 275             [ check-end-delegate ]
 276             [ check-every-rule ]
 277             [ check-word-break ]
 278         } 0|| drop
 279
 280         position inc
 281         mark-token-loop
 282     ] when ;
 283
 284 : mark-remaining ( -- )
 285     line get length position set
 286     check-rule ;
 287
 288 : unwind-no-line-break ( -- )
 289     context get parent>> [
 290         in-rule>> [
 291             no-line-break?>> [
 292                 pop-context
 293                 unwind-no-line-break
 294             ] when
 295         ] when*
 296     ] when* ;
 297
 298 : tokenize-line ( line-context line rules -- line-context' seq )
 299     [
 300         "MAIN" swap at -rot
 301         init-token-marker
 302         mark-token-loop
 303         mark-remaining
 304         unwind-no-line-break
 305         context get
 306     ] { } make ;