MDL-11082 Improved groups upgrade performance 1.8x -> 1.9; thanks Eloy for telling...
[moodle-pu.git] / lib / wiki_to_markdown.php
blobfa95f9995209ebecb87f634f53aa8eb622dfbfbc
1 <?php
2 // Utility function to convert wiki-like to Markdown format
3 // Howard Miller, 2005
// Block states tracked by WikiToMarkdown::convert() while it walks the input.
define('STATE_NONE',       1); // a blank line was seen; the next line starts a new block
define('STATE_PARAGRAPH',  2); // inside an ordinary paragraph
define('STATE_BLOCKQUOTE', 3); // inside a blockquote section
define('STATE_PREFORM',    4); // inside preformatted text
define('STATE_NOTIKI',     5); // inside preformatted text that gets no formatting at all

// List styles tracked by WikiToMarkdown::do_list().
define('LIST_NONE',       1); // no list is open
define('LIST_UNORDERED',  2); // an unordered list is open
define('LIST_ORDERED',    3); // an ordered list is open
define('LIST_DEFINITION', 4); // a definition list is open
class WikiToMarkdown {

    // Converter state. convert() resets all of it at the start of every run.
    var $block_state;    // STATE_* value for the block currently being processed
    var $list_state;     // LIST_* value for the list style currently open
    var $list_depth;     // nesting depth of the open list (number of list characters)
    var $list_backtrack; // stack of closing tags for the list levels still open
    var $output;         // output buffer (reset by convert(); not otherwise used in the code visible here)
    var $courseid;       // course id used when building file.php links
    var $spelling_on;    // set to false by convert(); declared so it is not created as a dynamic property
function close_block( $state ) {
    // Build the text that terminates the block identified by $state.
    // $state  one of the STATE_* constants
    // returns the closing text: any list still open is unwound first,
    //         then the block's own terminator follows

    // an open list is closed by running do_list() in "blank line" mode
    $list_closure = ($this->list_state != LIST_NONE) ? $this->do_list( " ", true ) : "";

    // terminator for the block itself; states not listed here close with nothing
    if ($state == STATE_PARAGRAPH || $state == STATE_BLOCKQUOTE) {
        $block_closure = "\n";
    } elseif ($state == STATE_PREFORM) {
        $block_closure = "</pre>\n";
    } elseif ($state == STATE_NOTIKI) {
        $block_closure = "\n";
    } else {
        $block_closure = "";
    }

    return $list_closure . $block_closure;
}
function do_replace( $line, $mark, $tag ) {
    // Wrap text delimited by $mark in an HTML element (bold, italic etc).
    // $line  the line of text to process
    // $mark  the magic character, given as a regex fragment (escaped where
    //        the character is special, e.g. "\+"); it is spliced straight
    //        into the patterns, so a mark containing '#' must escape it
    // $tag   the HTML tag name to insert, without angle brackets
    // returns the processed line
    //
    // Uses preg_replace: the POSIX eregi_replace used previously was
    // deprecated in PHP 5.3 and removed in PHP 7. '#' is the pattern
    // delimiter so that a "/" mark needs no extra escaping.

    // BODGE: replace inline $mark characters in places where we want them ignored
    // (between two alphanumerics); they are put back after the main
    // substitution, which stops problems with eg, and/or
    $bodge = chr(1);
    $line = preg_replace( '#([[:alnum:]])'.$mark.'([[:alnum:]])#i', '\\1'.$bodge.'\\2', $line );

    // the D modifier keeps "$" meaning "very end of the string", as it did in ereg
    $regex = '#(^| |[(.,])'.$mark.'([^'.$mark.']*)'.$mark.'([^[:alnum:]]|$)#iD';
    $replace = '\\1<'.$tag.'>\\2</'.$tag.'>\\3';
    $line = preg_replace( $regex, $replace, $line );

    // BODGE: back we go. $mark is restored verbatim, exactly as the ereg
    // version did, so an escaped mark such as "\+" comes back with its backslash.
    return str_replace( $bodge, $mark, $line );
}
function do_replace_markdown( $line, $mark, $tag ) {
    // Replace text delimited by $mark with the same text delimited by $tag.
    // MARKDOWN version: no HTML tags are generated, $tag is inserted as-is
    // on both sides of the text.
    // $line  the line of text to process
    // $mark  the magic character, given as a regex fragment (escaped where
    //        the character is special, e.g. "\*"); it is spliced straight
    //        into the patterns, so a mark containing '#' must escape it
    // $tag   the Markdown delimiter to insert before and after the text
    // returns the processed line
    //
    // Uses preg_replace: the POSIX eregi_replace used previously was
    // deprecated in PHP 5.3 and removed in PHP 7. '#' is the pattern
    // delimiter so that a "/" mark needs no extra escaping.

    // BODGE: replace inline $mark characters in places where we want them ignored
    // (between two alphanumerics); they are put back after the main
    // substitution, which stops problems with eg, and/or
    $bodge = chr(1);
    $line = preg_replace( '#([[:alnum:]])'.$mark.'([[:alnum:]])#i', '\\1'.$bodge.'\\2', $line );

    // the D modifier keeps "$" meaning "very end of the string", as it did in ereg
    $regex = '#(^| |[(.,])'.$mark.'([^'.$mark.']*)'.$mark.'([^[:alnum:]]|$)#iD';
    $replace = '\\1'.$tag.'\\2'.$tag.'\\3';
    $line = preg_replace( $regex, $replace, $line );

    // BODGE: back we go. $mark is restored verbatim, exactly as the ereg
    // version did, so an escaped mark such as "\*" comes back with its backslash.
    return str_replace( $bodge, $mark, $line );
}
function do_replace_sub( $line, $mark, $tag ) {
    // Wrap text delimited by $mark in an HTML element; used for subscript
    // and superscript. Unlike do_replace() the mark may sit directly
    // against other characters, so no context is required around it.
    // $line  the line of text to process
    // $mark  the magic character, given as a regex fragment (escaped where
    //        the character is special, e.g. "\^"); a mark containing '#'
    //        must escape it because '#' is the pattern delimiter
    // $tag   the HTML tag name to insert, without angle brackets
    // returns the processed line
    //
    // Uses preg_replace: the POSIX eregi_replace used previously was
    // deprecated in PHP 5.3 and removed in PHP 7.

    $regex = '#'.$mark.'([^'.$mark.']*)'.$mark.'#i';
    $replace = '<'.$tag.'>\\1</'.$tag.'>';

    return preg_replace( $regex, $replace, $line );
}
function do_list( $line, $blank=false ) {
    // Handle a line that starts with a list character, or close all open
    // lists when called for a blank line.
    // $line   the line; its leading run of one list character gives the
    //         list style and the nesting depth
    // $blank  true when called for a blank line: implies drop to level 0,
    //         $line is ignored and only closing tags are returned
    // returns the closing/opening tags needed to reach the new depth,
    //         followed (when not blank) by the indented item
    // Updates $this->list_state, $this->list_depth and $this->list_backtrack.

    // an empty line carries no list character, so treat it like a blank one
    if ($line === '') {
        $blank = true;
    }

    // get magic character and then delete it from the line if not blank
    if ($blank) {
        $listchar = "";
        $count = 0;
    }
    else {
        // $line[0] rather than $line{0}: the curly-brace offset syntax was removed in PHP 8
        $listchar = $line[0];
        $count = strspn( $line, $listchar );
        // preg_replace instead of the removed eregi_replace; the character is
        // quoted so that it is always taken literally inside the bracket expression
        $line = preg_replace( '~^['.preg_quote( $listchar, '~' ).']+ ~', '', $line );
    }

    // find what sort of list this character represents:
    // list open tag, list close tag, item open tag, item close tag, style
    $styles = array(
        '*' => array( "", "", "*", "", LIST_UNORDERED ),
        '#' => array( "", "", "1.", "", LIST_ORDERED ),
        ';' => array( "<dl>", "</dl>", "<dd>", "</dd>", LIST_DEFINITION ),
        ':' => array( "<dl>", "</dl>", "<dt>", "</dt>", LIST_DEFINITION ),
    );
    $list_tag = "";
    $list_close_tag = "";
    $item_tag = "";
    $item_close_tag = "";
    $list_style = LIST_NONE;
    if (isset( $styles[$listchar] )) {
        list( $list_tag, $list_close_tag, $item_tag, $item_close_tag, $list_style ) = $styles[$listchar];
    }

    // tag opening/closing regime now - fun bit :-)
    $tags = "";

    // if depth has reduced do number of closes to restore level
    for ($i = $this->list_depth; $i > $count; $i--) {
        $tags .= array_pop( $this->list_backtrack );
    }

    // if depth has increased do number of opens to balance
    for ($i = $this->list_depth; $i < $count; $i++) {
        array_push( $this->list_backtrack, $list_close_tag );
        $tags .= $list_tag;
    }

    // ok, so list state is now same as style and depth same as count
    $this->list_state = $list_style;
    $this->list_depth = $count;

    if ($blank) {
        return $tags;
    }

    // get indent: one space for every level below the first. The previous
    // substr() on a one-character string could never return any spaces,
    // so nested items lost their indentation.
    $indent = ($count > 1) ? str_repeat( ' ', $count - 1 ) : '';

    return $tags . $indent . $item_tag . " " . $line . $item_close_tag;
}
function line_replace( $line ) {
    // Return the line after the various inline formatting replacements
    // have been made - order is vital to stop them interfering with each other.
    // $line  a single line of wiki-like text
    // returns the line converted to Markdown (with some inline HTML)
    // Reads $CFG->wwwroot, $CFG->slasharguments and $this->courseid to build links.
    //
    // All patterns use preg_* with '~' as delimiter: the POSIX ereg functions
    // used previously were deprecated in PHP 5.3 and removed in PHP 7.

    global $CFG;

    // ---- (at least) means a <hr />
    // MARKDOWN: no change so leave

    // is this a list line (starts with * # ; :)
    if (preg_match( '~^([*]+|[#]+|[;]+|[:]+) ~', $line )) {
        $line = $this->do_list( $line );
    }

    // typographic conventions
    // MARKDOWN: no equiv. so convert to entity as before
    // (the "--" and " - " dash conversions are disabled in this converter)
    $search  = array( "...", "(R)", "(r)", "(TM)", "(tm)", "(C)", "1/4", "1/2", "3/4" );
    $replace = array( " &#8230; ", "&#174;", "&#174;", "&#8482;", "&#8482;", "&#169;", "&#188;", "&#189;", "&#190;" );
    $line = str_replace( $search, $replace, $line );
    // (digits) x (digits) - multiply
    $line = preg_replace( '~([[:digit:]]+[[:space:]]*)x([[:space:]]*[[:digit:]]+)~i', '\\1&#215;\\2', $line );

    // do formatting tags
    // NOTE: the * and / replacements emit no HTML and must come before the
    // ones that do, because the "/" replacement would otherwise screw up
    // the closing HTML tags added by the others
    // MARKDOWN: only bold and italic change, rest are just HTML
    // (the "-" mark for <del> is disabled in this converter)
    $line = $this->do_replace_markdown( $line, "\*", "**" );
    $line = $this->do_replace_markdown( $line, "/", "*" );
    $line = $this->do_replace( $line, "\+", "ins" );
    $line = $this->do_replace_sub( $line, "~", "sub" );
    $line = $this->do_replace_sub( $line, "\^", "sup" );
    $line = $this->do_replace( $line, "%", "code" );
    $line = $this->do_replace( $line, "@", "cite" );

    // convert urls into proper link with optional link text URL(text)
    // MARKDOWN: becomes [text](url)
    $line = preg_replace( '~([[:space:]]|^)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])\(([^)]+)\)~i',
            '\\1[\\5](\\2://\\3\\4)', $line );
    // the link text is group 4 in this pattern; the replacement used to
    // reference a non-existent group 5, which produced an empty link text
    $line = preg_replace( '~([[:space:]])www\.([^[:space:]]*)([[:alnum:]#?/&=])\(([^)]+)\)~i',
            '\\1[\\4](http://www.\\2\\3)', $line );

    // make urls (with and without http) into proper links
    // MARKDOWN: becomes <url>
    $line = preg_replace( '~([[:space:]]|^)([[:alnum:]]+)://([^[:space:]]*)([[:alnum:]#?/&=])~i',
            '\\1<\\2://\\3\\4>', $line );
    // closes with a plain ">"; the replacement used to end in "\>", which
    // put a stray backslash at the end of the link
    $line = preg_replace( '~([[:space:]])www\.([^[:space:]]*)([[:alnum:]#?/&=])~i',
            '\\1<http://www.\\2\\3>', $line );

    // make email addresses into mailtos....
    // MARKDOWN doesn't quite support this, so do as html
    $line = preg_replace( '~([[:space:]]|^)([[:alnum:]._-]+@[[:alnum:]._-]+)\(([^)]+)\)~i',
            '\\1<a href="mailto:\\2">\\3</a>', $line );

    // !# at the beginning of any lines means a heading
    // MARKDOWN: value (1-6) becomes number of hashes
    // (D and s keep "$" and "." behaving as they did under ereg)
    if (preg_match( '~^!([1-6]) (.*)$~Ds', $line, $regs )) {
        $line = str_repeat( '#', (int) $regs[1] ) . ' ' . $regs[2];
    }

    // acronym handling, example HTML(Hypertext Markup Language)
    // MARKDOWN: no equiv. so just leave as HTML
    // (case-sensitive on purpose: only upper-case words count as acronyms)
    $line = preg_replace( '~([A-Z]+)\(([^)]+)\)~', '<acronym title="\\2">\\1</acronym>', $line );

    // Replace resource link module:id(Description Text)
    // MARKDOWN: change to MD web link style
    $line = preg_replace( '~ ([a-zA-Z]+):([0-9]+)\(([^)]+)\)~i',
            ' [\\3]('.$CFG->wwwroot.'/mod/\\1/view.php?id=\\2) ', $line );

    // base url for course files, in the form the site's slasharguments setting needs
    if ($CFG->slasharguments) {
        $filebase = $CFG->wwwroot.'/file.php/'.$this->courseid.'/';
    } else {
        $filebase = $CFG->wwwroot.'/file.php?file=/'.$this->courseid.'/';
    }

    // Replace picture resource link /path/picture.png(alt text)
    $line = preg_replace( '~/([a-zA-Z0-9./_-]+)(png|gif|jpg)\(([^)]+)\)~i',
            '![\\3]('.$filebase.'\\1\\2)', $line );

    // Replace file resource link file:/path(link text)
    $line = preg_replace( '~file:/([[:alnum:]/._-]+)\(([^)]+)\)~i',
            '[\\2]('.$filebase.'\\1)', $line );

    return $line;
}
function convert( $content,$courseid ) {
    // Main entry point for processing Wiki-like text.
    // $content   string containing text with Wiki-like formatting
    // $courseid  course id, used when building links to course files
    // returns    string containing Markdown formatting
    //
    // The text is processed line by line. A blank line closes the current
    // block; the first line after it decides the type of the next block.

    // Markdown only treats a paragraph as a code block when every line is
    // indented by at least four spaces
    $code_indent = "    ";

    // initialisation stuff
    $this->output = "";
    $this->block_state = STATE_NONE;
    $this->list_state = LIST_NONE;
    $this->list_depth = 0;
    $this->list_backtrack = array();
    $this->spelling_on = false; // not read anywhere in the code visible here; kept as before
    $this->courseid = $courseid;

    // split content into array of single lines
    $lines = explode( "\n", $content );
    $buffer = "";

    // run through lines
    foreach ($lines as $line) {
        // is this a blank line? (nothing but spaces, tabs and carriage returns)
        if (trim( $line, " \t\r" ) === '') {
            // end current block according to state
            $buffer .= $this->close_block( $this->block_state );
            $this->block_state = STATE_NONE;
            continue;
        }

        // act now depending on current block state
        if ($this->block_state == STATE_NONE) {
            // start of the line defines block type
            if (strncmp( $line, '> ', 2 ) == 0) {
                // blockquote
                $buffer .= $this->line_replace( $line ) . "\n";
                $this->block_state = STATE_BLOCKQUOTE;
            }
            elseif (strncmp( $line, ' ', 1 ) == 0) {
                // preformatted text
                // MARKDOWN: no real equiv. so just use <pre>
                $buffer .= "<pre>\n" . $this->line_replace( $line ) . "\n";
                $this->block_state = STATE_PREFORM;
            }
            elseif (strncmp( $line, '% ', 2 ) == 0) {
                // preformatted text - no processing
                // MARKDOWN: this is MD code form of a paragraph;
                // only the leading % is dropped
                $buffer .= $code_indent . substr( $line, 1 ) . "\n";
                $this->block_state = STATE_NOTIKI;
            }
            else {
                // ordinary paragraph
                $buffer .= $this->line_replace( $line ) . "\n";
                $this->block_state = STATE_PARAGRAPH;
            }
            continue;
        }

        // continuation line of the block that is already open
        if ($this->block_state == STATE_PARAGRAPH
                || $this->block_state == STATE_BLOCKQUOTE
                || $this->block_state == STATE_PREFORM) {
            $buffer .= $this->line_replace( $line ) . "\n";
        }
        elseif ($this->block_state == STATE_NOTIKI) {
            // no formatting at all, just keep the line inside the code block
            $buffer .= $code_indent . $line . "\n";
        }
    }

    // close off any block level tags
    $buffer .= $this->close_block( $this->block_state );

    return $buffer;
}
function update( $thing, $textfield, $formatfield, $coursesql='' ) {
    // Convert the wiki text held by one kind of activity (or sub-activity).
    // $thing       the database name for that 'thing' (eg, resource, choice)
    // $textfield   the name of the field that might hold the wiki-text
    // $formatfield the name of the field that contains the format type
    // $coursesql   if supplied, the query used to get the courseid (the id of
    //              the record is tacked on right at the end, so phrase
    //              accordingly); if not, the record's 'course' field is used
    // returns a count of records converted

    // only records whose format field says FORMAT_WIKI are of interest
    $records = get_records( $thing, $formatfield, FORMAT_WIKI );
    if (!$records) {
        return 0;
    }

    $converted = 0;
    foreach ($records as $record) {
        // work out which course the record belongs to
        if ($coursesql) {
            $courserow = get_record_sql( $coursesql . $record->id );
            $courseid = $courserow->course;
        } else {
            $courseid = $record->course;
        }

        // swap the text for its Markdown version and flag the new format
        $record->$textfield = $this->convert( $record->$textfield, $courseid );
        $record->$formatfield = FORMAT_MARKDOWN;
        update_record( $thing, addslashes_object( $record ) );
        $converted++;
    }

    return $converted;
}