basis/regexp/parser/parser.factor

   1 ! Copyright (C) 2008 Doug Coleman.
   2 ! See http://factorcode.org/license.txt for BSD license.
   3 USING: accessors arrays assocs combinators io io.streams.string
   4 kernel math math.parser namespaces sets
   5 quotations sequences splitting vectors math.order
   6 strings regexp.backend regexp.utils
   7 unicode.case unicode.categories words locals regexp.classes ;
   8 IN: regexp.parser
   9
  10 FROM: math.ranges => [a,b] ;
  11
  12 TUPLE: concatenation seq ; INSTANCE: concatenation node
  13 TUPLE: alternation seq ; INSTANCE: alternation node
  14 TUPLE: kleene-star term ; INSTANCE: kleene-star node
  15
  16 ! !!!!!!!!
  17 TUPLE: possessive-question term ; INSTANCE: possessive-question node
  18 TUPLE: possessive-kleene-star term ; INSTANCE: possessive-kleene-star node
  19
  20 ! !!!!!!!!
  21 TUPLE: reluctant-question term ; INSTANCE: reluctant-question node
  22 TUPLE: reluctant-kleene-star term ; INSTANCE: reluctant-kleene-star node
  23
  24 TUPLE: negation term ; INSTANCE: negation node
  25 TUPLE: constant char ; INSTANCE: constant node
  26 TUPLE: range from to ; INSTANCE: range node
  27
  28 MIXIN: parentheses-group
  29 TUPLE: lookahead term ; INSTANCE: lookahead node
  30 INSTANCE: lookahead parentheses-group
  31 TUPLE: lookbehind term ; INSTANCE: lookbehind node
  32 INSTANCE: lookbehind parentheses-group
  33 TUPLE: capture-group term ; INSTANCE: capture-group node
  34 INSTANCE: capture-group parentheses-group
  35 TUPLE: non-capture-group term ; INSTANCE: non-capture-group node
  36 INSTANCE: non-capture-group parentheses-group
  37 TUPLE: independent-group term ; INSTANCE: independent-group node ! atomic group
  38 INSTANCE: independent-group parentheses-group
  39 TUPLE: comment-group term ; INSTANCE: comment-group node
  40 INSTANCE: comment-group parentheses-group
  41
  42 SINGLETON: epsilon INSTANCE: epsilon node
  43
  44 TUPLE: option option on? ; INSTANCE: option node
  45
  46 SINGLETONS: unix-lines dotall multiline comments case-insensitive
  47 unicode-case reversed-regexp ;
  48
  49 SINGLETONS: beginning-of-character-class end-of-character-class
  50 left-parenthesis pipe caret dash ;
  51
  52 : push1 ( obj -- ) input-stream get stream>> push ;
  53 : peek1 ( -- obj ) input-stream get stream>> [ f ] [ peek ] if-empty ;
  54 : pop3 ( seq -- obj1 obj2 obj3 ) [ pop ] [ pop ] [ pop ] tri spin ;
  55 : drop1 ( -- ) read1 drop ;
  56
  57 : stack ( -- obj ) current-regexp get stack>> ;
  58 : change-whole-stack ( quot -- )
  59     current-regexp get
  60     [ stack>> swap call ] keep (>>stack) ; inline
  61 : push-stack ( obj -- ) stack push ;
  62 : pop-stack ( -- obj ) stack pop ;
  63 : cut-out ( vector n -- vector' vector ) cut rest ;
  64 ERROR: cut-stack-error ;
  65 : cut-stack ( obj vector -- vector' vector )
  66     [ nip ] [ last-index ] 2bi [ cut-stack-error ] unless* cut-out swap ;
  67
  68 : <possessive-kleene-star> ( obj -- kleene ) possessive-kleene-star boa ;
  69 : <reluctant-kleene-star> ( obj -- kleene ) reluctant-kleene-star boa ;
  70 : <possessive-question> ( obj -- kleene ) possessive-question boa ;
  71 : <reluctant-question> ( obj -- kleene ) reluctant-question boa ;
  72
  73 : <negation> ( obj -- negation ) negation boa ;
  74 : <concatenation> ( seq -- concatenation )
  75     >vector [ epsilon ] [ concatenation boa ] if-empty ;
  76 : <alternation> ( seq -- alternation ) >vector alternation boa ;
  77 : <capture-group> ( obj -- capture-group ) capture-group boa ;
  78 : <kleene-star> ( obj -- kleene-star ) kleene-star boa ;
  79 : <constant> ( obj -- constant ) constant boa ;
  80
  81 : first|concatenation ( seq -- first/concatenation )
  82     dup length 1 = [ first ] [ <concatenation> ] if ;
  83
  84 : first|alternation ( seq -- first/alternation )
  85     dup length 1 = [ first ] [ <alternation> ] if ;
  86
  87 : <character-class-range> ( from to -- obj )
  88     2dup <
  89     [ character-class-range boa ] [ 2drop unmatchable-class ] if ;
  90
  91 ERROR: unmatched-parentheses ;
  92
  93 ERROR: unknown-regexp-option option ;
  94
  95 : ch>option ( ch -- singleton )
  96     {
  97         { CHAR: i [ case-insensitive ] }
  98         { CHAR: d [ unix-lines ] }
  99         { CHAR: m [ multiline ] }
 100         { CHAR: n [ multiline ] }
 101         { CHAR: r [ reversed-regexp ] }
 102         { CHAR: s [ dotall ] }
 103         { CHAR: u [ unicode-case ] }
 104         { CHAR: x [ comments ] }
 105         [ unknown-regexp-option ]
 106     } case ;
 107
 108 : option>ch ( option -- string )
 109     {
 110         { case-insensitive [ CHAR: i ] }
 111         { multiline [ CHAR: m ] }
 112         { reversed-regexp [ CHAR: r ] }
 113         { dotall [ CHAR: s ] }
 114         [ unknown-regexp-option ]
 115     } case ;
 116
 117 : toggle-option ( ch ? -- )
 118     [ ch>option ] dip option boa push-stack ;
 119
 120 : (parse-options) ( string ? -- ) [ toggle-option ] curry each ;
 121
 122 : parse-options ( string -- )
 123     "-" split1 [ t (parse-options) ] [ f (parse-options) ] bi* ;
 124
 125 ERROR: bad-special-group string ;
 126
 127 DEFER: (parse-regexp)
 128 : nested-parse-regexp ( token ? -- )
 129     [ push-stack (parse-regexp) pop-stack ] dip
 130     [ <negation> ] when pop-stack new swap >>term push-stack ;
 131
 132 ! non-capturing groups
 133 : (parse-special-group) ( -- )
 134     read1 {
 135         { [ dup CHAR: # = ] ! comment
 136             [ drop comment-group f nested-parse-regexp pop-stack drop ] }
 137         { [ dup CHAR: : = ]
 138             [ drop non-capture-group f nested-parse-regexp ] }
 139         { [ dup CHAR: = = ]
 140             [ drop lookahead f nested-parse-regexp ] }
 141         { [ dup CHAR: ! = ]
 142             [ drop lookahead t nested-parse-regexp ] }
 143         { [ dup CHAR: > = ]
 144             [ drop non-capture-group f nested-parse-regexp ] }
 145         { [ dup CHAR: < = peek1 CHAR: = = and ]
 146             [ drop drop1 lookbehind f nested-parse-regexp ] }
 147         { [ dup CHAR: < = peek1 CHAR: ! = and ]
 148             [ drop drop1 lookbehind t nested-parse-regexp ] }
 149         [
 150             ":)" read-until
 151             [ swap prefix ] dip
 152             {
 153                 { CHAR: : [ parse-options non-capture-group f nested-parse-regexp ] }
 154                 { CHAR: ) [ parse-options ] }
 155                 [ drop bad-special-group ]
 156             } case
 157         ]
 158     } cond ;
 159
 160 : handle-left-parenthesis ( -- )
 161     peek1 CHAR: ? =
 162     [ drop1 (parse-special-group) ]
 163     [ capture-group f nested-parse-regexp ] if ;
 164
 165 : handle-dot ( -- ) any-char push-stack ;
 166 : handle-pipe ( -- ) pipe push-stack ;
 167 : (handle-star) ( obj -- kleene-star )
 168     peek1 {
 169         { CHAR: + [ drop1 <possessive-kleene-star> ] }
 170         { CHAR: ? [ drop1 <reluctant-kleene-star> ] }
 171         [ drop <kleene-star> ]
 172     } case ;
 173 : handle-star ( -- ) stack pop (handle-star) push-stack ;
 174 : handle-question ( -- )
 175     stack pop peek1 {
 176         { CHAR: + [ drop1 <possessive-question> ] }
 177         { CHAR: ? [ drop1 <reluctant-question> ] }
 178         [ drop epsilon 2array <alternation> ]
 179     } case push-stack ;
 180 : handle-plus ( -- )
 181     stack pop dup (handle-star)
 182     2array <concatenation> push-stack ;
 183
 184 ERROR: unmatched-brace ;
 185 : parse-repetition ( -- start finish ? )
 186     "}" read-until [ unmatched-brace ] unless
 187     [ "," split1 [ string>number ] bi@ ]
 188     [ CHAR: , swap index >boolean ] bi ;
 189
 190 : replicate/concatenate ( n obj -- obj' )
 191     over zero? [ 2drop epsilon ]
 192     [ <repetition> first|concatenation ] if ;
 193
 194 : exactly-n ( n -- )
 195     stack pop replicate/concatenate push-stack ;
 196
 197 : at-least-n ( n -- )
 198     stack pop
 199     [ replicate/concatenate ] keep
 200     <kleene-star> 2array <concatenation> push-stack ;
 201
 202 : at-most-n ( n -- )
 203     1+
 204     stack pop
 205     [ replicate/concatenate ] curry map <alternation> push-stack ;
 206
 207 : from-m-to-n ( m n -- )
 208     [a,b]
 209     stack pop
 210     [ replicate/concatenate ] curry map
 211     <alternation> push-stack ;
 212
 213 ERROR: invalid-range a b ;
 214
 215 : handle-left-brace ( -- )
 216     parse-repetition
 217     [ 2dup [ [ 0 < [ invalid-range ] when ] when* ] bi@ ] dip
 218     [
 219         2dup and [ from-m-to-n ]
 220         [ [ nip at-most-n ] [ at-least-n ] if* ] if
 221     ] [ drop 0 max exactly-n ] if ;
 222
 223 : handle-front-anchor ( -- ) beginning-of-line push-stack ;
 224 : handle-back-anchor ( -- ) end-of-line push-stack ;
 225
 226 ERROR: bad-character-class obj ;
 227 ERROR: expected-posix-class ;
 228
 229 : parse-posix-class ( -- obj )
 230     read1 CHAR: { = [ expected-posix-class ] unless
 231     "}" read-until [ bad-character-class ] unless
 232     {
 233         { "Lower" [ letter-class ] }
 234         { "Upper" [ LETTER-class ] }
 235         { "Alpha" [ Letter-class ] }
 236         { "ASCII" [ ascii-class ] }
 237         { "Digit" [ digit-class ] }
 238         { "Alnum" [ alpha-class ] }
 239         { "Punct" [ punctuation-class ] }
 240         { "Graph" [ java-printable-class ] }
 241         { "Print" [ java-printable-class ] }
 242         { "Blank" [ non-newline-blank-class ] }
 243         { "Cntrl" [ control-character-class ] }
 244         { "XDigit" [ hex-digit-class ] }
 245         { "Space" [ java-blank-class ] }
 246         ! TODO: unicode-character-class, fallthrough in unicode is bad-char-clss
 247         [ bad-character-class ]
 248     } case ;
 249
 250 : parse-octal ( -- n ) 3 read oct> check-octal ;
 251 : parse-short-hex ( -- n ) 2 read hex> check-hex ;
 252 : parse-long-hex ( -- n ) 6 read hex> check-hex ;
 253 : parse-control-character ( -- n ) read1 ;
 254
 255 ERROR: bad-escaped-literals seq ;
 256
 257 : parse-til-E ( -- obj )
 258     "\\E" read-until [ bad-escaped-literals ] unless ;
 259
 260 :: (parse-escaped-literals) ( quot: ( obj -- obj' ) -- obj )
 261     parse-til-E
 262     drop1
 263     [ epsilon ] [
 264         quot call [ <constant> ] V{ } map-as
 265         first|concatenation
 266     ] if-empty ; inline
 267
 268 : parse-escaped-literals ( -- obj )
 269     [ ] (parse-escaped-literals) ;
 270
 271 : lower-case-literals ( -- obj )
 272     [ >lower ] (parse-escaped-literals) ;
 273
 274 : upper-case-literals ( -- obj )
 275     [ >upper ] (parse-escaped-literals) ;
 276
 277 : parse-escaped ( -- obj )
 278     read1
 279     {
 280         { CHAR: t [ CHAR: \t <constant> ] }
 281         { CHAR: n [ CHAR: \n <constant> ] }
 282         { CHAR: r [ CHAR: \r <constant> ] }
 283         { CHAR: f [ HEX: c <constant> ] }
 284         { CHAR: a [ HEX: 7 <constant> ] }
 285         { CHAR: e [ HEX: 1b <constant> ] }
 286
 287         { CHAR: w [ c-identifier-class ] }
 288         { CHAR: W [ c-identifier-class <negation> ] }
 289         { CHAR: s [ java-blank-class ] }
 290         { CHAR: S [ java-blank-class <negation> ] }
 291         { CHAR: d [ digit-class ] }
 292         { CHAR: D [ digit-class <negation> ] }
 293
 294         { CHAR: p [ parse-posix-class ] }
 295         { CHAR: P [ parse-posix-class <negation> ] }
 296         { CHAR: x [ parse-short-hex <constant> ] }
 297         { CHAR: u [ parse-long-hex <constant> ] }
 298         { CHAR: 0 [ parse-octal <constant> ] }
 299         { CHAR: c [ parse-control-character ] }
 300
 301         { CHAR: Q [ parse-escaped-literals ] }
 302
 303         ! { CHAR: b [ word-boundary-class ] }
 304         ! { CHAR: B [ word-boundary-class <negation> ] }
 305         ! { CHAR: A [ handle-beginning-of-input ] }
 306         ! { CHAR: z [ handle-end-of-input ] }
 307
 308         ! { CHAR: Z [ handle-end-of-input ] } ! plus a final terminator
 309
 310         ! m//g mode
 311         ! { CHAR: G [ end of previous match ] }
 312
 313         ! Group capture
 314         ! { CHAR: 1 [ CHAR: 1 <constant> ] }
 315         ! { CHAR: 2 [ CHAR: 2 <constant> ] }
 316         ! { CHAR: 3 [ CHAR: 3 <constant> ] }
 317         ! { CHAR: 4 [ CHAR: 4 <constant> ] }
 318         ! { CHAR: 5 [ CHAR: 5 <constant> ] }
 319         ! { CHAR: 6 [ CHAR: 6 <constant> ] }
 320         ! { CHAR: 7 [ CHAR: 7 <constant> ] }
 321         ! { CHAR: 8 [ CHAR: 8 <constant> ] }
 322         ! { CHAR: 9 [ CHAR: 9 <constant> ] }
 323
 324         ! Perl extensions
 325         ! can't do \l and \u because \u is already a 4-hex
 326         { CHAR: L [ lower-case-literals ] }
 327         { CHAR: U [ upper-case-literals ] }
 328
 329         [ <constant> ]
 330     } case ;
 331
 332 : handle-escape ( -- ) parse-escaped push-stack ;
 333
 334 : handle-dash ( vector -- vector' )
 335     H{ { dash CHAR: - } } substitute ;
 336
 337 : character-class>alternation ( seq -- alternation )
 338     [ dup number? [ <constant> ] when ] map first|alternation ;
 339
 340 : handle-caret ( vector -- vector' )
 341     dup [ length 2 >= ] [ first caret eq? ] bi and [
 342         rest-slice character-class>alternation <negation>
 343     ] [
 344         character-class>alternation
 345     ] if ;
 346
 347 : make-character-class ( -- character-class )
 348     [ beginning-of-character-class swap cut-stack ] change-whole-stack
 349     handle-dash handle-caret ;
 350
 351 : apply-dash ( -- )
 352     stack [ pop3 nip <character-class-range> ] keep push ;
 353
 354 : apply-dash? ( -- ? )
 355     stack dup length 3 >=
 356     [ [ length 2 - ] keep nth dash eq? ] [ drop f ] if ;
 357
 358 ERROR: empty-negated-character-class ;
 359 DEFER: handle-left-bracket
 360 : (parse-character-class) ( -- )
 361     read1 [ empty-negated-character-class ] unless* {
 362         { CHAR: [ [ handle-left-bracket t ] }
 363         { CHAR: ] [ make-character-class push-stack f ] }
 364         { CHAR: - [ dash push-stack t ] }
 365         { CHAR: \ [ parse-escaped push-stack t ] }
 366         [ push-stack apply-dash? [ apply-dash ] when t ]
 367     } case
 368     [ (parse-character-class) ] when ;
 369
 370 : push-constant ( ch -- ) <constant> push-stack ;
 371
 372 : parse-character-class-second ( -- )
 373     read1 {
 374         { CHAR: [ [ CHAR: [ push-constant ] }
 375         { CHAR: ] [ CHAR: ] push-constant ] }
 376         { CHAR: - [ CHAR: - push-constant ] }
 377         [ push1 ]
 378     } case ;
 379
 380 : parse-character-class-first ( -- )
 381     read1 {
 382         { CHAR: ^ [ caret push-stack parse-character-class-second ] }
 383         { CHAR: [ [ CHAR: [ push-constant ] }
 384         { CHAR: ] [ CHAR: ] push-constant ] }
 385         { CHAR: - [ CHAR: - push-constant ] }
 386         [ push1 ]
 387     } case ;
 388
 389 : handle-left-bracket ( -- )
 390     beginning-of-character-class push-stack
 391     parse-character-class-first (parse-character-class) ;
 392
 393 : finish-regexp-parse ( stack -- obj )
 394     { pipe } split
 395     [ first|concatenation ] map first|alternation ;
 396
 397 : handle-right-parenthesis ( -- )
 398     stack dup [ parentheses-group "members" word-prop member? ] find-last
 399     -rot cut rest
 400     [ [ push ] keep current-regexp get (>>stack) ]
 401     [ finish-regexp-parse push-stack ] bi* ;
 402
 403 : parse-regexp-token ( token -- ? )
 404     {
 405         { CHAR: ( [ handle-left-parenthesis t ] } ! handle (?..) at beginning?
 406         { CHAR: ) [ handle-right-parenthesis f ] }
 407         { CHAR: . [ handle-dot t ] }
 408         { CHAR: | [ handle-pipe t ] }
 409         { CHAR: ? [ handle-question t ] }
 410         { CHAR: * [ handle-star t ] }
 411         { CHAR: + [ handle-plus t ] }
 412         { CHAR: { [ handle-left-brace t ] }
 413         { CHAR: [ [ handle-left-bracket t ] }
 414         { CHAR: \ [ handle-escape t ] }
 415         [
 416             dup CHAR: $ = peek1 f = and
 417             [ drop handle-back-anchor f ]
 418             [ push-constant t ] if
 419         ]
 420     } case ;
 421
 422 : (parse-regexp) ( -- )
 423     read1 [ parse-regexp-token [ (parse-regexp) ] when ] when* ;
 424
 425 : parse-regexp-beginning ( -- )
 426     peek1 CHAR: ^ = [ drop1 handle-front-anchor ] when ;
 427
 428 : parse-regexp ( regexp -- )
 429     dup current-regexp [
 430         raw>> [
 431             <string-reader> [
 432                 parse-regexp-beginning (parse-regexp)
 433             ] with-input-stream
 434         ] unless-empty
 435         current-regexp get [ finish-regexp-parse ] change-stack
 436         dup stack>> >>parse-tree drop
 437     ] with-variable ;