lib/wiki_to_markdown.php

   1 <?php
   2 // Utility function to convert wiki-like to Markdown format
   3 // Howard Miller, 2005
   4
   5 // state defines
   6 define( "STATE_NONE",1 ); // blank line has been detected, so looking for first line on next para
   7 define( "STATE_PARAGRAPH",2 ); // currently processing vanilla paragraph
   8 define( "STATE_BLOCKQUOTE",3 ); // currently processing blockquote section
   9 define( "STATE_PREFORM",4 ); // currently processing preformatted text
  10 define( "STATE_NOTIKI",5 ); // currently processing preformatted / no formatting
  11
  12 // list defines
  13 define( "LIST_NONE", 1 ); // no lists active
  14 define( "LIST_UNORDERED", 2 ); // unordered list active
  15 define( "LIST_ORDERED", 3 ); // ordered list active
  16 define( "LIST_DEFINITION", 4 ); // definition list active
  17
  18 class WikiToMarkdown {
  19
  20   var $block_state;
  21   var $list_state;
  22   var $list_depth;
  23   var $list_backtrack;
  24   var $output; // output buffer
  25   var $courseid;
  26
  27   function close_block( $state ) {
  28     // provide appropriate closure for block according to state
  29
  30     // if in list close this first
  31     $lclose = "";
  32     if ($this->list_state != LIST_NONE) {
  33       $lclose = $this->do_list( " ",true );
  34     }
  35
  36     $sclose = "";
  37     switch ($state) {
  38       case STATE_PARAGRAPH:
  39         $sclose =  "\n";
  40         break;
  41       case STATE_BLOCKQUOTE:
  42         $sclose =  "\n";
  43         break;
  44       case STATE_PREFORM:
  45         $sclose =  "</pre>\n";
  46         break;
  47       case STATE_NOTIKI:
  48         $sclose =  "\n";
  49         break;
  50     }
  51
  52     return $lclose . $sclose;
  53   }
  54
  55   function do_replace( $line, $mark, $tag ) {
  56     // do the regex thingy for things like bold, italic etc
  57     // $mark is the magic character, and $tag the HTML tag to insert
  58
  59     // BODGE: replace inline $mark characters in places where we want them ignored
  60     // they will be put back after main substitutue, stops problems with eg, and/or
  61     $bodge = chr(1);
  62     $line = eregi_replace( '([[:alnum:]])'.$mark.'([[:alnum:]])', '\\1'.$bodge.'\\2',$line );
  63
  64     $regex = '(^| |[(.,])'.$mark.'([^'.$mark.']*)'.$mark.'([^[:alnum:]]|$)';
  65     $replace = '\\1<'.$tag.'>\\2</'.$tag.'>\\3';
  66     $line = eregi_replace( $regex, $replace, $line );
  67
  68     // BODGE: back we go
  69     $line = eregi_replace( $bodge, $mark, $line );
  70
  71     return $line;
  72   }
  73
  74
  75   function do_replace_markdown( $line, $mark, $tag ) {
  76     // do the regex thingy for things like bold, italic etc
  77     // $mark is the magic character, and $tag the HTML tag to insert
  78     // MARKDOWN version does not generate HTML tags, just straigt replace
  79
  80     // BODGE: replace inline $mark characters in places where we want them ignored
  81     // they will be put back after main substitutue, stops problems with eg, and/or
  82     $bodge = chr(1);
  83     $line = eregi_replace( '([[:alnum:]])'.$mark.'([[:alnum:]])', '\\1'.$bodge.'\\2',$line );
  84
  85     $regex = '(^| |[(.,])'.$mark.'([^'.$mark.']*)'.$mark.'([^[:alnum:]]|$)';
  86     $replace = '\\1'.$tag.'\\2'.$tag.'\\3';
  87     $line = eregi_replace( $regex, $replace, $line );
  88
  89     // BODGE: back we go
  90     $line = eregi_replace( $bodge, $mark, $line );
  91
  92     return $line;
  93   }
  94
  95
  96   function do_replace_sub( $line, $mark, $tag ) {
  97     // do regex for subscript and superscript (slightly different)
  98     // $mark is the magic character and $tag the HTML tag to insert
  99
 100     $regex = $mark.'([^'.$mark.']*)'.$mark;
 101     $replace = '<'.$tag.'>\\1</'.$tag.'>';
 102
 103     return eregi_replace( $regex, $replace, $line );
 104   }
 105
 106   function do_list( $line, $blank=false ) {
 107     // handle line with list character on it
 108     // if blank line implies drop to level 0
 109
 110     // get magic character and then delete it from the line if not blank
 111     if ($blank) {
 112       $listchar="";
 113       $count = 0;
 114     }
 115     else {
 116       $listchar = $line{0};
 117       $count = strspn( $line, $listchar );
 118       $line = eregi_replace( "^[".$listchar."]+ ", "", $line );
 119     }
 120
 121     // find what sort of list this character represents
 122     $list_tag = "";
 123     $list_close_tag = "";
 124     $item_tag = "";
 125     $item_close_tag = "";
 126     $list_style = LIST_NONE;
 127     switch ($listchar) {
 128       case '*':
 129         $list_tag = "";
 130         $list_close_tag = "";
 131         $item_tag = "*";
 132         $item_close_tag = "";
 133         $list_style = LIST_UNORDERED;
 134         break;
 135       case '#':
 136         $list_tag = "";
 137         $list_close_tag = "";
 138         $item_tag = "1.";
 139         $item_close_tag = "";
 140         $list_style = LIST_ORDERED;
 141         break;
 142       case ';':
 143         $list_tag = "<dl>";
 144         $list_close_tag = "</dl>";
 145         $item_tag = "<dd>";
 146         $item_close_tag = "</dd>";
 147         $list_style = LIST_DEFINITION;
 148         break;
 149       case ':':
 150         $list_tag = "<dl>";
 151         $list_close_tag = "</dl>";
 152         $item_tag = "<dt>";
 153         $item_close_tag = "</dt>";
 154         $list_style = LIST_DEFINITION;
 155         break;
 156       }
 157
 158     // tag opening/closing regime now - fun bit :-)
 159     $tags = "";
 160
 161     // if depth has reduced do number of closes to restore level
 162     for ($i=$this->list_depth; $i>$count; $i-- ) {
 163       $close_tag = array_pop( $this->list_backtrack );
 164       $tags = $tags . $close_tag;
 165       }
 166
 167     // if depth has increased do number of opens to balance
 168     for ($i=$this->list_depth; $i<$count; $i++ ) {
 169       array_push( $this->list_backtrack, "$list_close_tag" );
 170       $tags = $tags . "$list_tag";
 171     }
 172
 173     // ok, so list state is now same as style and depth same as count
 174     $this->list_state = $list_style;
 175     $this->list_depth = $count;
 176
 177     // get indent
 178     $indent = substr( "                      ",1,$count-1 );
 179
 180     if ($blank) {
 181       $newline = $tags;
 182     }
 183     else {
 184       $newline = $tags . $indent . "$item_tag " . $line . "$item_close_tag";
 185     }
 186
 187     return $newline;
 188   }
 189
 190
 191   function line_replace( $line ) {
 192     // return line after various formatting replacements
 193     // have been made - order is vital to stop them interfering with each other
 194
 195     global $CFG;
 196
 197     // ---- (at least) means a <hr />
 198     // MARKDOWN: no change so leave
 199
 200     // is this a list line (starts with * # ; :)
 201     if (eregi( "^([*]+|[#]+|[;]+|[:]+) ", $line )) {
 202       $line = $this->do_list( $line );
 203     }
 204
 205    // typographic conventions
 206    // MARKDOWN: no equiv. so convert to entity as before
 207     // $line = str_replace( "--", "&#8212;", $line );
 208     // $line = str_replace( " - ", " &#8211; ", $line );
 209     $line = str_replace( "...", " &#8230; ", $line );
 210     $line = str_replace( "(R)", "&#174;", $line );
 211     $line = str_replace( "(r)", "&#174;", $line );
 212     $line = str_replace( "(TM)", "&#8482;", $line );
 213     $line = str_replace( "(tm)", "&#8482;", $line );
 214     $line = str_replace( "(C)", "&#169;", $line );
 215     $line = str_replace( "1/4", "&#188;", $line );
 216     $line = str_replace( "1/2", "&#189;", $line );
 217     $line = str_replace( "3/4", "&#190;", $line );
 218     $line = eregi_replace( "([[:digit:]]+[[:space:]]*)x([[:space:]]*[[:digit:]]+)", "\\1&#215;\\2", $line ); // (digits) x (digits) - multiply
 219     // do formatting tags
 220     // NOTE: The / replacement  *has* to be first, or it will screw the
 221     //    HTML tags that are added by the other ones
 222     // MARKDOWN: only bold and italic change, rest are just HTML
 223     $line = $this->do_replace_markdown( $line, "\*", "**" );
 224     $line = $this->do_replace_markdown( $line, "/", "*" );
 225     $line = $this->do_replace( $line, "\+", "ins" );
 226     // $line = $this->do_replace( $line, "-", "del" );
 227     $line = $this->do_replace_sub( $line, "~", "sub" );
 228     $line = $this->do_replace_sub( $line, "\^", "sup" );
 229     $line = $this->do_replace( $line, "%", "code" );
 230     $line = $this->do_replace( $line, "@", "cite" );
 231
 232     // convert urls into proper link with optional link text URL(text)
 233     // MARDOWN: HTML conversion should work fine
 234     $line = eregi_replace("([[:space:]]|^)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])\(([^)]+)\)",
 235       "\\1[\\5](\\2://\\3\\4)", $line);
 236     $line = eregi_replace("([[:space:]])www\.([^[:space:]]*)([[:alnum:]#?/&=])\(([^)]+)\)",
 237       "\\1[\\5](http://www.\\2\\3)", $line);
 238
 239     // make urls (with and without httpd) into proper links
 240     $line = eregi_replace("([[:space:]]|^)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])",
 241       "\\1<\\2://\\3\\4>", $line);
 242     $line = eregi_replace("([[:space:]])www\.([^[:space:]]*)([[:alnum:]#?/&=])",
 243       "\\1<http://www.\\2\\3\>", $line);
 244
 245     // make email addresses into mailtos....
 246     // MARKDOWN doesn't quite support this, so do as html
 247     $line = eregi_replace("([[:space:]]|^)([[:alnum:]._-]+@[[:alnum:]._-]+)\(([^)]+)\)",
 248        "\\1<a href=\"mailto:\\2\">\\3</a>", $line);
 249
 250     // !# at the beginning of any lines means a heading
 251     // MARKDOWN: value (1-6) becomes number of hashes
 252     if (eregi( "^!([1-6]) (.*)$", $line, $regs )) {
 253       $depth = substr( $line, 1, 1 );
 254       $out = substr( '##########', 0, $depth);
 255       $line = eregi_replace( "^!([1-6]) (.*)$", "$out \\2", $line );
 256     }
 257
 258     // acronym handing, example HTML(Hypertext Markyp Language)
 259     // MARKDOWN: no equiv. so just leave as HTML
 260     $line = ereg_replace( "([A-Z]+)\(([^)]+)\)", "<acronym title=\"\\2\">\\1</acronym>", $line );
 261
 262     // Replace resource link >>##(Description Text)
 263     // MARKDOWN: change to MD web link style
 264     $line = eregi_replace( " ([a-zA-Z]+):([0-9]+)\(([^)]+)\)",
 265        " [\\3](".$CFG->wwwroot."/mod/\\1/view.php?id=\\2) ", $line );
 266
 267     // Replace picture resource link
 268     if ($CFG->slasharguments) {
 269       $line = eregi_replace( "/([a-zA-Z0-9./_-]+)(png|gif|jpg)\(([^)]+)\)",
 270         "![\\3]($CFG->wwwroot/file.php/$this->courseid/\\1\\2)", $line );
 271     } else {
 272       $line = eregi_replace( "/([a-zA-Z0-9./_-]+)(png|gif|jpg)\(([^)]+)\)",
 273         "![\\3]($CFG->wwwroot/file.php?file=/$this->courseid/\\1\\2)", $line );
 274     }
 275
 276     // Replace file resource link
 277     if ($CFG->slasharguments) {
 278       $line = eregi_replace( "file:/([[:alnum:]/._-]+)\(([^)]+)\)",
 279         "[\\2]($CFG->wwwroot/file.php/$this->courseid/\\1)", $line );
 280     } else {
 281       $line = eregi_replace( "file:/([[:alnum:]/._-]+)\(([^)]+)\)",
 282         "[\\2]($CFG->wwwroot/file.php?file=/$this->courseid/\\1)", $line );
 283     }
 284
 285
 286     return $line;
 287   }
 288
 289   function convert( $content,$courseid ) {
 290
 291     // main entry point for processing Wiki-like text
 292     // $content is string containing text with Wiki-Like formatting
 293     // return: string containing Markdown formatting
 294
 295     // initialisation stuff
 296     $this->output = "";
 297     $this->block_state = STATE_NONE;
 298     $this->list_state = LIST_NONE;
 299     $this->list_depth = 0;
 300     $this->list_backtrack = array();
 301     $this->spelling_on = false;
 302     $this->courseid = $courseid;
 303
 304     // split content into array of single lines
 305     $lines = explode( "\n",$content );
 306     $buffer = "";
 307
 308     // run through lines
 309     foreach( $lines as $line ) {
 310       // is this a blank line?
 311       $blank_line = eregi( "^[[:blank:]\r]*$", $line );
 312       if ($blank_line) {
 313         // first end current block according to state
 314         $buffer = $buffer . $this->close_block( $this->block_state );
 315         $this->block_state = STATE_NONE;
 316         continue;
 317       }
 318
 319       // act now depending on current block state
 320       if ($this->block_state == STATE_NONE) {
 321         // first character of line defines block type
 322         if (eregi( "^> ",$line )) {
 323           // blockquote
 324           $buffer = $buffer . $this->line_replace( $line ). "\n";
 325           $this->block_state = STATE_BLOCKQUOTE;
 326         }
 327         else
 328         if (eregi( "^  ",$line) ) {
 329           // preformatted text
 330           // MARKDOWN: no real equiv. so just use <pre>
 331           $buffer = $buffer . "<pre>\n";
 332           $buffer = $buffer . $this->line_replace($line) . "\n";
 333           $this->block_state = STATE_PREFORM;
 334         }
 335         else
 336         if (eregi("^\% ",$line) ) {
 337                 // preformatted text - no processing
 338                 // MARKDOWN: this is MD code form of a paragraph
 339                 $buffer = $buffer . "    " . eregi_replace( "^\%","",$line) . "\n";
 340                 $this->block_state = STATE_NOTIKI;
 341         }
 342         else {
 343           // ordinary paragraph
 344           $buffer = $buffer . $this->line_replace($line) . "\n";
 345           $this->block_state = STATE_PARAGRAPH;
 346         }
 347         continue;
 348       }
 349
 350       if (($this->block_state == STATE_PARAGRAPH) |
 351           ($this->block_state == STATE_BLOCKQUOTE) |
 352           ($this->block_state == STATE_PREFORM) ) {
 353         $buffer = $buffer . $this->line_replace($line) . "\n";
 354         continue;
 355       }
 356       elseif ($this->block_state == STATE_NOTIKI) {
 357         $buffer = $buffer . "    " .$line . "\n";
 358       }
 359     }
 360
 361     // close off any block level tags
 362     $buffer = $buffer . $this->close_block( $this->block_state );
 363
 364     //return $buffer;
 365     return $buffer;
 366   }
 367
 368   function update( $thing, $textfield, $formatfield, $coursesql='' ) {
 369     // converts the text in a particular activity (or sub-activity)
 370     // $thing = the database name for that 'thing' (eg, resource, choice)
 371     // $textfield = the name of the field that might hold the wiki-text
 372     // $formatfield = the name of the field that contains the format type
 373     // $coursesql = if supplied, the query to get the courseid, if not get from the 'course' field
 374     //   ($id of record is tacked on right at the end, so phrase accordingly)
 375     // returns a count of records converted
 376     $count = 0;
 377     if ($records = get_records( $thing,$formatfield,FORMAT_WIKI )) {
 378         foreach( $records as $record ) {
 379           $text = $record->$textfield;
 380           $id = $record->id;
 381           if (!$coursesql) {
 382             $courseid = $record->course;
 383           } else {
 384             $r = get_record_sql( $coursesql . "$id" );
 385             $courseid = $r->course;
 386           }
 387           $newtext = $this->convert( $text,$courseid );
 388
 389           $record->$textfield = $newtext;
 390           $record->$formatfield = FORMAT_MARKDOWN;
 391           update_record( $thing, addslashes_object( $record ) );
 392           $count++;
 393         }
 394     }
 395     return $count;
 396   }
 397 }
 398 ?>