MDL-15476
[moodle-linuxchix.git] / lib / html2text.php
blob42c05565ff3b17c279365d06dfdcfb20c10e428e
1 <?php
3 /***************************************************************
4 * Library to convert HTML into an approximate text equivalent *
5 ***************************************************************
7 Version: 1.0.3 (with modifications)
8 Copyright 2003 Mark Wilton-Jones
9 License: HowToCreate script license with written permission
10 URL: http://www.howtocreate.co.uk/php/
12 For full details about the script and to get the latest version,
13 please see the HowToCreate web site above.
15 This version contains modifications for Moodle. In each case the
16 lines are marked with "Moodle", so you can see what has changed.
18 ********************************************************************/
/**
 * Convert an HTML fragment or page into an approximate plain-text equivalent.
 *
 * Processing steps, in order:
 *  1. remove HTML comments;
 *  2. walk the markup character by character, dropping <style>/<script>
 *     sections and escaping any '<', '>' or '"' that appears where it cannot
 *     be part of a tag, so that strip_tags() sees well-formed tags;
 *  3. strip every tag except the ones that get a text representation;
 *  4. squeeze double spaces outside <pre> and <textarea>;
 *  5. replace the remaining tags with text (rules, bullets, tabs, link
 *     targets in square brackets, image alt text, form markers);
 *  6. decode named and numeric HTML entities;
 *  7. wrap at 78 columns and normalise line breaks / trailing whitespace.
 *
 * @param string $badStr HTML to convert.
 * @return string Plain-text approximation of the input.
 */
function html2text( $badStr ) {

    $badStr = (string) $badStr;

    $is_open_tb = false; // currently inside a tag ("<" seen, matching ">" not yet)
    $is_open_dq = false; // currently inside a double-quoted attribute value
    $is_open_sq = false; // currently inside a single-quoted attribute value

    //remove comments

    while ( ( $start = strpos( $badStr, '<!--' ) ) !== false &&
            ( $end = strpos( $badStr, '-->', $start ) ) !== false ) {
        $badStr = substr( $badStr, 0, $start ) . substr( $badStr, $end + 3 );
    }

    //now make sure all HTML tags are correctly written (> not in between quotes)

    $len      = strlen( $badStr );      // Moodle
    $lowerStr = strtolower( $badStr );  // lower-cased once, used for case-insensitive tag lookups
    $goodStr  = '';                     // Moodle

    for ( $x = 0; $x < $len; $x++ ) {   // Moodle
        $chr = $badStr[$x]; //take each letter in turn and check if that character is permitted there
        switch ( $chr ) {
            case '<':
                if ( !$is_open_tb && substr( $lowerStr, $x + 1, 5 ) == 'style' ) {
                    // Skip the whole style block. If it is never closed, skip to the
                    // end of the input instead of jumping back to a fixed offset.
                    $close = strpos( $lowerStr, '</style>', $x );
                    $x = ( $close === false ) ? $len : $close + 7;
                    $chr = '';
                } else if ( !$is_open_tb && substr( $lowerStr, $x + 1, 6 ) == 'script' ) {
                    // Same treatment for script blocks.
                    $close = strpos( $lowerStr, '</script>', $x );
                    $x = ( $close === false ) ? $len : $close + 8;
                    $chr = '';
                } else if ( !$is_open_tb ) {
                    $is_open_tb = true;
                } else {
                    // a "<" inside an already open tag cannot start a new tag
                    $chr = '&lt;';
                }
                break;

            case '>':
                if ( !$is_open_tb || $is_open_dq || $is_open_sq ) {
                    // not a tag terminator: outside any tag, or inside a quoted value
                    $chr = '&gt;';
                } else {
                    $is_open_tb = false;
                }
                break;

            case '"':
                if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_dq = true;
                } else if ( $is_open_tb && $is_open_dq && !$is_open_sq ) {
                    $is_open_dq = false;
                } else {
                    $chr = '&quot;';
                }
                break;

            case "'":
                if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                    $is_open_sq = true;
                } else if ( $is_open_tb && !$is_open_dq && $is_open_sq ) {
                    $is_open_sq = false;
                }
                break;
        }
        $goodStr .= $chr;
    }

    //now that the page is valid (I hope) for strip_tags, strip all unwanted tags

    $goodStr = strip_tags( $goodStr, '<title><hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );

    //strip extra whitespace except between <pre> and <textarea> tags

    // Splitting on the opening/closing tags leaves the tag contents at the odd indexes.
    $preParts = preg_split( "/<\/?pre[^>]*>/i", $goodStr );
    foreach ( $preParts as $x => $prePart ) {
        if ( $x % 2 ) {
            $preParts[$x] = '<pre>'.$prePart.'</pre>';
            continue;
        }
        $areaParts = preg_split( "/<\/?textarea[^>]*>/i", $prePart );
        foreach ( $areaParts as $z => $areaPart ) {
            if ( $z % 2 ) {
                $areaParts[$z] = '<textarea>'.$areaPart.'</textarea>';
            } else {
                // collapse double spaces into single ones
                $areaParts[$z] = str_replace( '  ', ' ', $areaPart );
            }
        }
        $preParts[$x] = implode( '', $areaParts );
    }
    $goodStr = implode( '', $preParts );

    //remove all options from select inputs

    $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );

    //replace all tags with their text equivalents

    $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n --------------------\n", $goodStr );

    $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );

    $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );

    $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );

    // ASCII bullet, so the output is valid in any character encoding
    $goodStr = preg_replace( "/<li[^>]*>/i", "\n* ", $goodStr );

    $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );

    $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );

    // $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", "[LINK: $2$4$6] ", $goodStr ); // Moodle
    // Links become "link text [target]"; fragment-only and javascript: targets are not matched.
    $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>([^<]*)<\/a>/i", "$7 [$2$4$6]", $goodStr );

    // $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[IMAGE: $2$3$4] ", $goodStr ); // Moodle
    $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[$2$3$4] ", $goodStr );

    $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );

    $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );

    //strip all remaining tags (mostly closing tags)

    $goodStr = strip_tags( $goodStr );

    //convert HTML entities

    $goodStr = strtr( $goodStr, array_flip( get_html_translation_table( HTML_ENTITIES ) ) );

    // Decode numeric entities (&#NNN;) and keep the result. The decoded
    // characters are emitted as UTF-8; references html_entity_decode() does not
    // accept are left untouched.
    $goodStr = preg_replace_callback(
        "/&#(\d+);/",
        function ( $match ) {
            return html_entity_decode( $match[0], ENT_QUOTES, 'UTF-8' );
        },
        $goodStr
    );

    //wordwrap

    // $goodStr = wordwrap( $goodStr ); // Moodle
    $goodStr = wordwrap( $goodStr, 78 );

    //make sure there are no more than 3 linebreaks in a row and trim whitespace
    $goodStr = preg_replace("/\r\n?|\f/", "\n", $goodStr);
    $goodStr = preg_replace("/\n(\s*\n){2}/", "\n\n\n", $goodStr);
    $goodStr = preg_replace("/[ \t]+(\n|$)/", "$1", $goodStr);
    $goodStr = preg_replace("/^\n*|\n*$/", '', $goodStr);

    return $goodStr;
}