2 # irb/ruby-lex.rb - ruby lexical analyzer
3 # $Release Version: 0.9.5$
5 # $Date: 2007-02-12 15:01:19 -0800 (Mon, 12 Feb 2007) $
6 # by Keiju ISHITSUKA(keiju@ruby-lang.org)
15 require "irb/ruby-token"
18 @RCS_ID='-$Id: ruby-lex.rb 11708 2007-02-12 23:01:19Z shyouhei $-'
20 extend Exception2MessageMapper
21 def_exception(:AlreadyDefinedToken, "Already defined token(%s)")
22 def_exception(:TkReading2TokenNoKey, "key nothing(key='%s')")
23 def_exception(:TkSymbol2TokenNoKey, "key nothing(key='%s')")
24 def_exception(:TkReading2TokenDuplicateError,
25 "key duplicate(token_n='%s', key='%s')")
26 def_exception(:SyntaxError, "%s")
28 def_exception(:TerminateLineInput, "Terminate Line Input")
33 attr_accessor :debug_level
45 @exp_line_no = @line_no = 1
62 @readed_auto_clean_up = false
63 @exception_on_syntax_error = true
68 attr_accessor :skip_space
69 attr_accessor :readed_auto_clean_up
70 attr_accessor :exception_on_syntax_error
78 def set_input(io, p = nil, &block)
80 if p and p.respond_to?(:call)
85 @input = Proc.new{@io.gets}
90 if idx = @readed.reverse.index("\n")
93 @base_char_no += @readed.size
96 readed = @readed.join("")
103 # return nil unless buf_input
104 @rests.push nil unless buf_input
128 return nil if l == "" and c.nil?
145 if @here_readed.empty?
148 c2 = @here_readed.pop
151 @rests.unshift c #c =
155 if idx = @readed.reverse.index("\n")
156 @char_no = @readed.size - idx
158 @char_no = @base_char_no + @readed.size
167 until @rests.size >= chrs.size
168 return false unless buf_input
170 @rests[0, chrs.size] == chrs
173 def peek_match?(regexp)
175 return false unless buf_input
177 regexp =~ @rests.join("")
181 while @rests.size <= i
182 return nil unless buf_input
190 return nil unless line
191 @rests.concat line.split(//)
196 def set_prompt(p = nil, &block)
197 p = block if block_given?
198 if p.respond_to?(:call)
201 @prompt = Proc.new{print p}
207 @prompt.call(@ltype, @indent, @continue, @line_no)
216 @lex_state = EXPR_BEG
224 @exp_line_no = @line_no
227 def each_top_level_statement
229 catch(:TERM_INPUT) do
235 throw :TERM_INPUT if @line == ''
239 if @ltype or @continue or @indent > 0
244 yield @line, @exp_line_no
248 @exp_line_no = @line_no
253 rescue TerminateLineInput
263 until (((tk = token).kind_of?(TkNL) || tk.kind_of?(TkEND_OF_SCRIPT)) &&
272 if line == "" and tk.kind_of?(TkEND_OF_SCRIPT) || tk.nil?
283 @prev_line_no = @line_no
284 @prev_char_no = @char_no
288 @space_seen = tk.kind_of?(TkSPACE)
290 raise if @exception_on_syntax_error
291 tk = TkError.new(@seek, @line_no, @char_no)
293 end while @skip_space and tk.kind_of?(TkSPACE)
294 if @readed_auto_clean_up
302 "case", "class", "def", "do", "for", "if",
303 "module", "unless", "until", "while", "begin" #, "when"
305 DEINDENT_CLAUSE = ["end" #, "when"
341 @OP.def_rules("\0", "\004", "\032") do |op, io|
342 Token(TkEND_OF_SCRIPT)
345 @OP.def_rules(" ", "\t", "\f", "\r", "\13") do |op, io|
347 while getc =~ /[ \t\f\r\13]/; end
352 @OP.def_rule("#") do |op, io|
356 @OP.def_rule("=begin",
357 proc{|op, io| @prev_char_no == 0 && peek(0) =~ /\s/}) do
360 until getc == "\n"; end
361 until peek_equal?("=end") && peek(4) =~ /\s/
362 until getc == "\n"; end
369 @OP.def_rule("\n") do |op, io|
370 print "\\n\n" if RubyLex.debug?
372 when EXPR_BEG, EXPR_FNAME, EXPR_DOT
376 @lex_state = EXPR_BEG
377 until (@indent_stack.empty? ||
378 [TkLPAREN, TkLBRACK, TkLBRACE,
379 TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
388 @OP.def_rules("*", "**",
395 when EXPR_FNAME, EXPR_DOT
396 @lex_state = EXPR_ARG
398 @lex_state = EXPR_BEG
403 @OP.def_rules("!", "!=", "!~") do
405 @lex_state = EXPR_BEG
409 @OP.def_rules("<<") do
412 if @lex_state != EXPR_END && @lex_state != EXPR_CLASS &&
413 (@lex_state != EXPR_ARG || @space_seen)
415 if /\S/ =~ c && (/["'`]/ =~ c || /[\w_]/ =~ c || c == "-")
416 tk = identify_here_document
422 when EXPR_FNAME, EXPR_DOT
423 @lex_state = EXPR_ARG
425 @lex_state = EXPR_BEG
431 @OP.def_rules("'", '"') do
436 @OP.def_rules("`") do
438 if @lex_state == EXPR_FNAME
439 @lex_state = EXPR_END
446 @OP.def_rules('?') do
448 if @lex_state == EXPR_END
449 @lex_state = EXPR_BEG
453 if @lex_state == EXPR_ARG && ch =~ /\s/
455 @lex_state = EXPR_BEG;
461 @lex_state = EXPR_END
467 @OP.def_rules("&", "&&", "|", "||") do
469 @lex_state = EXPR_BEG
473 @OP.def_rules("+=", "-=", "*=", "**=",
474 "&=", "|=", "^=", "<<=", ">>=", "||=", "&&=") do
476 @lex_state = EXPR_BEG
481 @OP.def_rule("+@", proc{|op, io| @lex_state == EXPR_FNAME}) do
483 @lex_state = EXPR_ARG
487 @OP.def_rule("-@", proc{|op, io| @lex_state == EXPR_FNAME}) do
489 @lex_state = EXPR_ARG
493 @OP.def_rules("+", "-") do
496 if @lex_state == EXPR_ARG
497 if @space_seen and peek(0) =~ /[0-9]/
498 throw :RET, identify_number
500 @lex_state = EXPR_BEG
502 elsif @lex_state != EXPR_END and peek(0) =~ /[0-9]/
503 throw :RET, identify_number
505 @lex_state = EXPR_BEG
513 @lex_state = EXPR_BEG
514 if peek(0) =~ /[0-9]/
519 @lex_state = EXPR_DOT
524 @OP.def_rules("..", "...") do
526 @lex_state = EXPR_BEG
534 @OP.def_rules("]", "}", ")") do
536 @lex_state = EXPR_END
544 if @lex_state == EXPR_END || peek(0) =~ /\s/
545 @lex_state = EXPR_BEG
548 @lex_state = EXPR_FNAME;
553 @OP.def_rule("::") do
555 # p @lex_state.id2name, @space_seen
556 if @lex_state == EXPR_BEG or @lex_state == EXPR_ARG && @space_seen
557 @lex_state = EXPR_BEG
560 @lex_state = EXPR_DOT
567 if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
571 @lex_state = EXPR_BEG
572 Token(TkOPASGN, "/") #/)
573 elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
576 @lex_state = EXPR_BEG
581 @OP.def_rules("^") do
583 @lex_state = EXPR_BEG
587 # @OP.def_rules("^=") do
588 # @lex_state = EXPR_BEG
592 @OP.def_rules(",") do
594 @lex_state = EXPR_BEG
598 @OP.def_rules(";") do
600 @lex_state = EXPR_BEG
601 until (@indent_stack.empty? ||
602 [TkLPAREN, TkLBRACK, TkLBRACE,
603 TkfLPAREN, TkfLBRACK, TkfLBRACE].include?(@indent_stack.last))
611 @lex_state = EXPR_BEG
615 @OP.def_rule("~@", proc{|op, io| @lex_state == EXPR_FNAME}) do
617 @lex_state = EXPR_BEG
624 if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
625 @lex_state = EXPR_BEG
628 @lex_state = EXPR_BEG
631 @indent_stack.push tk_c
635 @OP.def_rule("[]", proc{|op, io| @lex_state == EXPR_FNAME}) do
637 @lex_state = EXPR_ARG
641 @OP.def_rule("[]=", proc{|op, io| @lex_state == EXPR_FNAME}) do
643 @lex_state = EXPR_ARG
650 if @lex_state == EXPR_FNAME
653 if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
655 elsif @lex_state == EXPR_ARG && @space_seen
660 @lex_state = EXPR_BEG
662 @indent_stack.push tk_c
669 if @lex_state != EXPR_END && @lex_state != EXPR_ARG
674 @lex_state = EXPR_BEG
675 @indent_stack.push tk_c
679 @OP.def_rule('\\') do
693 if @lex_state == EXPR_BEG || @lex_state == EXPR_MID
698 elsif @lex_state == EXPR_ARG and @space_seen and peek(0) !~ /\s/
701 @lex_state = EXPR_BEG
713 if peek(0) =~ /[\w_@]/
721 # @OP.def_rule("def", proc{|op, io| /\s/ =~ io.peek(0)}) do
724 # @lex_state = EXPR_FNAME
725 # # @lex_state = EXPR_END
726 # # until @rests[0] == "\n" or @rests[0] == ";"
733 printf "MATCH: start %s: %s\n", op, io.inspect if RubyLex.debug?
734 if peek(0) =~ /[0-9]/
736 elsif peek(0) =~ /[\w_]/
737 t = identify_identifier
739 printf "MATCH: end %s: %s\n", op, io.inspect if RubyLex.debug?
743 p @OP if RubyLex.debug?
747 @lex_state = EXPR_END
750 when /[~_*$?!@\/\\;,=:<>".]/ #"
751 Token(TkGVAR, "$" + ch)
753 Token(TkGVAR, "$-" + getc)
754 when "&", "`", "'", "+"
755 Token(TkBACK_REF, "$"+ch)
757 while getc =~ /[0-9]/; end
770 def identify_identifier
773 token.concat(c = getc)
774 if c == "@" and peek(0) == "@"
779 while (ch = getc) =~ /\w|_/
780 print ":", ch, ":" if RubyLex.debug?
785 if (ch == "!" || ch == "?") && token[0,1] =~ /\w/ && peek(0) != "="
793 return Token(TkGVAR, token)
795 @lex_state = EXPR_END
796 # p Token(TkCVAR, token)
797 return Token(TkCVAR, token)
799 @lex_state = EXPR_END
800 return Token(TkIVAR, token)
803 if @lex_state != EXPR_DOT
804 print token, "\n" if RubyLex.debug?
806 token_c, *trans = TkReading2Token[token]
810 if (@lex_state != EXPR_BEG &&
811 @lex_state != EXPR_FNAME &&
814 token_c = TkSymbol2Token[trans[1]]
815 @lex_state = trans[0]
817 if @lex_state != EXPR_FNAME
818 if ENINDENT_CLAUSE.include?(token)
819 # check for ``class = val'' etc.
823 valid = false unless peek_match?(/^\s*(<<|\w|::)/)
825 valid = false if peek_match?(/^\s*(([+-\/*&\|^]|<<|>>|\|\||\&\&)=|\&\&|\|\|)/)
827 valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&)/)
828 when *ENINDENT_CLAUSE
829 valid = false if peek_match?(/^\s*([+-\/*]?=|\*|<|>|\&|\|)/)
835 if ![TkFOR, TkWHILE, TkUNTIL].include?(@indent_stack.last)
837 @indent_stack.push token_c
841 @indent_stack.push token_c
846 elsif DEINDENT_CLAUSE.include?(token)
850 @lex_state = trans[0]
852 @lex_state = EXPR_END
855 return Token(token_c, token)
859 if @lex_state == EXPR_FNAME
860 @lex_state = EXPR_END
864 elsif @lex_state == EXPR_BEG || @lex_state == EXPR_DOT
865 @lex_state = EXPR_ARG
867 @lex_state = EXPR_END
870 if token[0, 1] =~ /[A-Z]/
871 return Token(TkCONSTANT, token)
872 elsif token[token.size - 1, 1] =~ /[!?]/
873 return Token(TkFID, token)
875 return Token(TkIDENTIFIER, token)
879 def identify_here_document
881 # if lt = PERCENT_LTYPE[ch]
889 while (c = getc) && c != lt
895 while (c = getc) && c =~ /\w/
901 ltback, @ltype = @ltype, lt
906 reserve.push ch = getc
914 l = l.sub(/(:?\r)?\n\z/, '')
915 if (indent ? l.strip : l) == quoted
921 @here_readed.concat reserve
922 while ch = reserve.pop
927 @lex_state = EXPR_END
928 Token(Ltype2Token[lt])
931 def identify_quotation
933 if lt = PERCENT_LTYPE[ch]
938 RubyLex.fail SyntaxError, "unknown type of %string"
945 @quoted = ch unless @quoted = PERCENT_PAREN[ch]
946 identify_string(lt, @quoted)
950 @lex_state = EXPR_END
952 if peek(0) == "0" && peek(1) !~ /[.eE]/
957 match = /[0-9a-fA-F_]/
970 RubyLex.fail SyntaxError, "Illegal octal digit"
972 return Token(TkINTEGER)
981 RubyLex.fail SyntaxError, "trailing `#{ch}' in number"
992 RubyLex.fail SyntaxError, "numeric literal without digits"
995 RubyLex.fail SyntaxError, "trailing `#{non_digit}' in number"
1000 return Token(TkINTEGER)
1013 when allow_point && "."
1015 RubyLex.fail SyntaxError, "trailing `#{non_digit}' in number"
1018 if peek(0) !~ /[0-9]/
1024 when allow_e && "e", allow_e && "E"
1026 RubyLex.fail SyntaxError, "trailing `#{non_digit}' in number"
1029 if peek(0) =~ /[+-]/
1037 RubyLex.fail SyntaxError, "trailing `#{non_digit}' in number"
1046 def identify_string(ltype, quoted = ltype)
1053 if @quoted == ch and nest == 0
1055 elsif @ltype != "'" && @ltype != "]" && @ltype != ":" and ch == "#"
1060 if PERCENT_PAREN.values.include?(@quoted)
1061 if PERCENT_PAREN[ch] == @quoted
1069 if peek(0) =~ /i|m|x|o|e|s|u|n/
1074 Token(DLtype2Token[ltype])
1076 Token(Ltype2Token[ltype])
1081 @lex_state = EXPR_END
1085 def identify_comment
1098 return Token(TkCOMMENT)
1103 when "\n", "\r", "\f"
1104 when "\\", "n", "t", "r", "f", "v", "a", "e", "b", "s" #"
1131 if (ch = getc) != '-'
1134 if (ch = getc) == "\\" #"
1139 when "C", "c" #, "^"
1140 if ch == "C" and (ch = getc) != "-"
1142 elsif (ch = getc) == "\\" #"