lib/rdoc/markup.rb

   1 require 'rdoc'
   2
   3 ##
   4 # RDoc::Markup parses plain text documents and attempts to decompose them into
   5 # their constituent parts.  Some of these parts are high-level: paragraphs,
   6 # chunks of verbatim text, list entries and the like.  Other parts happen at
   7 # the character level: a piece of bold text, a word in code font.  This markup
   8 # is similar in spirit to that used on WikiWiki webs, where folks create web
   9 # pages using a simple set of formatting rules.
  10 #
  11 # RDoc::Markup itself does no output formatting: this is left to a different
  12 # set of classes.
  13 #
  14 # RDoc::Markup is extendable at runtime: you can add \new markup elements to
  15 # be recognised in the documents that RDoc::Markup parses.
  16 #
  17 # RDoc::Markup is intended to be the basis for a family of tools which share
  18 # the common requirement that simple, plain-text should be rendered in a
  19 # variety of different output formats and media.  It is envisaged that
  20 # RDoc::Markup could be the basis for formatting RDoc style comment blocks,
  21 # Wiki entries, and online FAQs.
  22 #
  23 # = Basic Formatting
  24 #
  25 # * RDoc::Markup looks for a document's natural left margin.  This is
  26 #   used as the initial margin for the document.
  27 #
  28 # * Consecutive lines starting at this margin are considered to be a
  29 #   paragraph.
  30 #
  31 # * If a paragraph starts with a "*", "-", or with "<digit>.", then it is
  32 #   taken to be the start of a list.  The margin is increased to be the first
  33 #   non-space following the list start flag.  Subsequent lines should be
  34 #   indented to this \new margin until the list ends.  For example:
  35 #
  36 #      * this is a list with three paragraphs in
  37 #        the first item.  This is the first paragraph.
  38 #
  39 #        And this is the second paragraph.
  40 #
  41 #        1. This is an indented, numbered list.
  42 #        2. This is the second item in that list
  43 #
  44 #        This is the third conventional paragraph in the
  45 #        first list item.
  46 #
  47 #      * This is the second item in the original list
  48 #
  49 # * You can also construct labeled lists, sometimes called description
  50 #   or definition lists.  Do this by putting the label in square brackets
  51 #   and indenting the list body:
  52 #
  53 #       [cat]  a small furry mammal
  54 #              that seems to sleep a lot
  55 #
  56 #       [ant]  a little insect that is known
  57 #              to enjoy picnics
  58 #
  59 #   A minor variation on labeled lists uses two colons to separate the
  60 #   label from the list body:
  61 #
  62 #       cat::  a small furry mammal
  63 #              that seems to sleep a lot
  64 #
  65 #       ant::  a little insect that is known
  66 #              to enjoy picnics
  67 #
  68 #   This latter style guarantees that the list bodies' left margins are
  69 #   aligned: think of them as a two column table.
  70 #
  71 # * Any line that starts to the right of the current margin is treated
  72 #   as verbatim text.  This is useful for code listings.  The example of a
  73 #   list above is also verbatim text.
  74 #
  75 # * A line starting with an equals sign (=) is treated as a
  76 #   heading.  Level one headings have one equals sign, level two headings
  77 #   have two, and so on.
  78 #
  79 # * A line starting with three or more hyphens (at the current indent)
  80 #   generates a horizontal rule.  The more hyphens, the thicker the rule
  81 #   (within reason, and if supported by the output device)
  82 #
  83 # * You can use markup within text (except verbatim) to change the
  84 #   appearance of parts of that text.  Out of the box, RDoc::Markup
  85 #   supports word-based and general markup.
  86 #
  87 #   Word-based markup uses flag characters around individual words:
  88 #
  89 #   [\*word*]  displays word in a *bold* font
  90 #   [\_word_]  displays word in an _emphasized_ font
  91 #   [\+word+]  displays word in a +code+ font
  92 #
  93 #   General markup affects text between a start delimiter and an end
  94 #   delimiter.  Not surprisingly, these delimiters look like HTML markup.
  95 #
  96 #   [\<b>text...</b>]    displays word in a *bold* font
  97 #   [\<em>text...</em>]  displays word in an _emphasized_ font
  98 #   [\<i>text...</i>]    displays word in an _emphasized_ font
  99 #   [\<tt>text...</tt>]  displays word in a +code+ font
 100 #
 101 #   Unlike conventional Wiki markup, general markup can cross line
 102 #   boundaries.  You can turn off the interpretation of markup by
 103 #   preceding the first character with a backslash, so \\\<b>bold
 104 #   text</b> and \\\*bold* produce \<b>bold text</b> and \*bold*
 105 #   respectively.
 106 #
 107 # * Hyperlinks to the web starting http:, mailto:, ftp:, or www. are
 108 #   recognized.  An HTTP url that references an external image file is
 109 #   converted into an inline <IMG..>.  Hyperlinks starting 'link:' are
 110 #   assumed to refer to local files whose path is relative to the --op
 111 #   directory.
 112 #
 113 #   Hyperlinks can also be of the form <tt>label</tt>[url], in which
 114 #   case the label is used in the displayed text, and <tt>url</tt> is
 115 #   used as the target.  If <tt>label</tt> contains multiple words,
 116 #   put it in braces: <em>{multi word label}[</em>url<em>]</em>.
 117 #
 118 # == Synopsis
 119 #
 120 # This code converts +input_string+ to HTML.  The conversion takes place in
 121 # the +convert+ method, so you can use the same RDoc::Markup converter to
 122 # convert multiple input strings.
 123 #
 124 #   require 'rdoc/markup/to_html'
 125 #
 126 #   h = RDoc::Markup::ToHtml.new
 127 #
 128 #   puts h.convert(input_string)
 129 #
 130 # You can extend the RDoc::Markup parser to recognise new markup
 131 # sequences, and to add special processing for text that matches a
 132 # regular expression.  Here we make WikiWords significant to the parser,
 133 # and also make the sequences {word} and \<no>text...</no> signify
 134 # strike-through text.  We then subclass the HTML output class to deal
 135 # with these:
 136 #
 137 #   require 'rdoc/markup'
 138 #   require 'rdoc/markup/to_html'
 139 #
 140 #   class WikiHtml < RDoc::Markup::ToHtml
 141 #     def handle_special_WIKIWORD(special)
 142 #       "<font color=red>" + special.text + "</font>"
 143 #     end
 144 #   end
 145 #
 146 #   m = RDoc::Markup.new
 147 #   m.add_word_pair("{", "}", :STRIKE)
 148 #   m.add_html("no", :STRIKE)
 149 #
 150 #   m.add_special(/\b([A-Z][a-z]+[A-Z]\w+)/, :WIKIWORD)
 151 #
 152 #   wh = WikiHtml.new
 153 #   wh.add_tag(:STRIKE, "<strike>", "</strike>")
 154 #
 155 #   puts "<body>#{wh.convert ARGF.read}</body>"
 156 #
 157 #--
 158 # Author::   Dave Thomas,  dave@pragmaticprogrammer.com
 159 # License::  Ruby license
 160
 161 class RDoc::Markup
 162
  # The space character that indentation is made of.  Lines are compared
  # against it column by column when checking margins.
  SPACE = ?\s

  # List entries look like:
  #   *       text
  #   1.      text
  #   [label] text
  #   label:: text
  #
  # Flag it as a list entry, and work out the indent for subsequent lines.

  # Matches a bullet ("*" or "-"), a number followed by a dot, or a single
  # letter followed by a dot, at the start of the text.  The marker must be
  # followed by whitespace and then a non-space character.  Capture 1 is the
  # marker plus its trailing whitespace (its length gives the new margin);
  # capture 2 is the marker alone.
  SIMPLE_LIST_RE = /^(
                (  \*          (?# bullet)
                  |-           (?# bullet)
                  |\d+\.       (?# numbered )
                  |[A-Za-z]\.  (?# alphabetically numbered )
                )
                \s+
              )\S/x

  # Matches a "[label]" or "label::" entry at the start of the text.  Unlike
  # SIMPLE_LIST_RE the label may be followed by nothing at all (end of line),
  # in which case the body is looked for on the next line.  Capture 1 is the
  # label plus any trailing whitespace; capture 2 is the label alone,
  # including its brackets or colons.
  LABEL_LIST_RE = /^(
                      (  \[.*?\]    (?# labeled  )
                        |\S.*::     (?# note     )
                      )(?:\s+|$)
                    )/x
 187
 188   ##
 189   # Take a block of text and use various heuristics to determine its
 190   # structure (paragraphs, lists, and so on).  Invoke an event handler as we
 191   # identify significant chunks.
 192
 193   def initialize
 194     @am = RDoc::Markup::AttributeManager.new
 195     @output = nil
 196   end
 197
 198   ##
 199   # Add to the sequences used to add formatting to an individual word (such
 200   # as *bold*).  Matching entries will generate attributes that the output
 201   # formatters can recognize by their +name+.
 202
 203   def add_word_pair(start, stop, name)
 204     @am.add_word_pair(start, stop, name)
 205   end
 206
 207   ##
 208   # Add to the sequences recognized as general markup.
 209
 210   def add_html(tag, name)
 211     @am.add_html(tag, name)
 212   end
 213
 214   ##
 215   # Add to other inline sequences.  For example, we could add WikiWords using
 216   # something like:
 217   #
 218   #    parser.add_special(/\b([A-Z][a-z]+[A-Z]\w+)/, :WIKIWORD)
 219   #
 220   # Each wiki word will be presented to the output formatter via the
 221   # accept_special method.
 222
 223   def add_special(pattern, name)
 224     @am.add_special(pattern, name)
 225   end
 226
 227   ##
 228   # We take a string, split it into lines, work out the type of each line,
 229   # and from there deduce groups of lines (for example all lines in a
 230   # paragraph).  We then invoke the output formatter using a Visitor to
 231   # display the result.
 232
 233   def convert(str, op)
 234     lines = str.split(/\r?\n/).map { |line| Line.new line }
 235     @lines = Lines.new lines
 236
 237     return "" if @lines.empty?
 238     @lines.normalize
 239     assign_types_to_lines
 240     group = group_lines
 241     # call the output formatter to handle the result
 242     #group.each { |line| p line }
 243     group.accept @am, op
 244   end
 245
 246   private
 247
 248   ##
 249   # Look through the text at line indentation.  We flag each line as being
 250   # Blank, a paragraph, a list element, or verbatim text.
 251
 252   def assign_types_to_lines(margin = 0, level = 0)
 253     while line = @lines.next
 254       if line.blank? then
 255         line.stamp :BLANK, level
 256         next
 257       end
 258
 259       # if a line contains non-blanks before the margin, then it must belong
 260       # to an outer level
 261
 262       text = line.text
 263
 264       for i in 0...margin
 265         if text[i] != SPACE
 266           @lines.unget
 267           return
 268         end
 269       end
 270
 271       active_line = text[margin..-1]
 272
 273       # Rules (horizontal lines) look like
 274       #
 275       #  ---   (three or more hyphens)
 276       #
 277       # The more hyphens, the thicker the rule
 278       #
 279
 280       if /^(---+)\s*$/ =~ active_line
 281         line.stamp :RULE, level, $1.length-2
 282         next
 283       end
 284
 285       # Then look for list entries.  First the ones that have to have
 286       # text following them (* xxx, - xxx, and dd. xxx)
 287
 288       if SIMPLE_LIST_RE =~ active_line
 289         offset = margin + $1.length
 290         prefix = $2
 291         prefix_length = prefix.length
 292
 293         flag = case prefix
 294                when "*","-" then :BULLET
 295                when /^\d/   then :NUMBER
 296                when /^[A-Z]/ then :UPPERALPHA
 297                when /^[a-z]/ then :LOWERALPHA
 298                else raise "Invalid List Type: #{self.inspect}"
 299                end
 300
 301         line.stamp :LIST, level+1, prefix, flag
 302         text[margin, prefix_length] = " " * prefix_length
 303         assign_types_to_lines(offset, level + 1)
 304         next
 305       end
 306
 307       if LABEL_LIST_RE =~ active_line
 308         offset = margin + $1.length
 309         prefix = $2
 310         prefix_length = prefix.length
 311
 312         next if handled_labeled_list(line, level, margin, offset, prefix)
 313       end
 314
 315       # Headings look like
 316       # = Main heading
 317       # == Second level
 318       # === Third
 319       #
 320       # Headings reset the level to 0
 321
 322       if active_line[0] == ?= and active_line =~ /^(=+)\s*(.*)/
 323         prefix_length = $1.length
 324         prefix_length = 6 if prefix_length > 6
 325         line.stamp :HEADING, 0, prefix_length
 326         line.strip_leading(margin + prefix_length)
 327         next
 328       end
 329
 330       # If the character's a space, then we have verbatim text,
 331       # otherwise
 332
 333       if active_line[0] == SPACE
 334         line.strip_leading(margin) if margin > 0
 335         line.stamp :VERBATIM, level
 336       else
 337         line.stamp :PARAGRAPH, level
 338       end
 339     end
 340   end
 341
 342   ##
 343   # Handle labeled list entries.  We have a special case to deal with.
 344   # Because the labels can be long, they force the remaining block of text
 345   # over to the right:
 346   #
 347   #   this is a long label that I wrote:: and here is the
 348   #                                       block of text with
 349   #                                       a silly margin
 350   #
 351   # So we allow the special case.  If the label is followed by nothing, and
 352   # if the following line is indented, then we take the indent of that line
 353   # as the new margin.
 354   #
 355   #   this is a long label that I wrote::
 356   #       here is a more reasonably indented block which
 357   #       will be attached to the label.
 358   #
 359
 360   def handled_labeled_list(line, level, margin, offset, prefix)
 361     prefix_length = prefix.length
 362     text = line.text
 363     flag = nil
 364
 365     case prefix
 366     when /^\[/ then
 367       flag = :LABELED
 368       prefix = prefix[1, prefix.length-2]
 369     when /:$/ then
 370       flag = :NOTE
 371       prefix.chop!
 372     else
 373       raise "Invalid List Type: #{self.inspect}"
 374     end
 375
 376     # body is on the next line
 377     if text.length <= offset then
 378       original_line = line
 379       line = @lines.next
 380       return false unless line
 381       text = line.text
 382
 383       for i in 0..margin
 384         if text[i] != SPACE
 385           @lines.unget
 386           return false
 387         end
 388       end
 389
 390       i = margin
 391       i += 1 while text[i] == SPACE
 392
 393       if i >= text.length then
 394         @lines.unget
 395         return false
 396       else
 397         offset = i
 398         prefix_length = 0
 399
 400         if text[offset..-1] =~ SIMPLE_LIST_RE then
 401           @lines.unget
 402           line = original_line
 403           line.text = ''
 404         else
 405           @lines.delete original_line
 406         end
 407       end
 408     end
 409
 410     line.stamp :LIST, level+1, prefix, flag
 411     text[margin, prefix_length] = " " * prefix_length
 412     assign_types_to_lines(offset, level + 1)
 413     return true
 414   end
 415
 416   ##
 417   # Return a block consisting of fragments which are paragraphs, list
 418   # entries or verbatim text.  We merge consecutive lines of the same type
 419   # and level together.  We are also slightly tricky with lists: the lines
 420   # following a list introduction look like paragraph lines at the next
 421   # level, and we remap them into list entries instead.
 422
 423   def group_lines
 424     @lines.rewind
 425
 426     in_list = false
 427     wanted_type = wanted_level = nil
 428
 429     block = LineCollection.new
 430     group = nil
 431
 432     while line = @lines.next
 433       if line.level == wanted_level and line.type == wanted_type
 434         group.add_text(line.text)
 435       else
 436         group = block.fragment_for(line)
 437         block.add(group)
 438
 439         if line.type == :LIST
 440           wanted_type = :PARAGRAPH
 441         else
 442           wanted_type = line.type
 443         end
 444
 445         wanted_level = line.type == :HEADING ? line.param : line.level
 446       end
 447     end
 448
 449     block.normalize
 450     block
 451   end
 452
 453   ##
 454   # For debugging, we allow access to our line contents as text.
 455
 456   def content
 457     @lines.as_text
 458   end
 459   public :content
 460
 461   ##
 462   # For debugging, return the list of line types.
 463
 464   def get_line_types
 465     @lines.line_types
 466   end
 467   public :get_line_types
 468
 469 end
 470
 471 require 'rdoc/markup/fragments'
 472 require 'rdoc/markup/inline'
 473 require 'rdoc/markup/lines'