/**
 * Based on XML_Utility functions submitted by troels_kn.
 * Credit also to adios, who helped with reg exps:
 * http://www.sitepoint.com/forums/showthread.php?t=201052
 *
 * A replacement for HTMLArea.getHTML
 *
 * - Generates XHTML code
 * - Much faster than HTMLArea.getHTML
 * - Eliminates the hacks to accommodate browser quirks
 * - Returns correct code for Flash objects and scripts
 * - Formats html in an indented, readable format in html mode
 * - Preserves script and pre formatting
 * - Preserves formatting in comments
 * - Removes contenteditable from body tag in full-page mode
 * - Supports only7BitPrintablesInURLs config option
 * - Supports htmlRemoveTags config option
 */
/**
 * Plugin constructor.
 *
 * @param {Object} editor The editor instance this plugin is attached to.
 */
function GetHtml(editor) {
	// NOTE(review): the constructor body did not survive in the source this
	// was restored from; keeping a reference to the editor is assumed here --
	// confirm against the upstream plugin.
	this.editor = editor;
}

/**
 * Plugin metadata.
 *
 * NOTE(review): only the developer fields were readable in the source; any
 * further fields (for example a name, version or license entry) need to be
 * restored from upstream.
 */
GetHtml._pluginInfo = {
	developer     : "Nelson Bright",
	developer_url : "http://www.brightworkweb.com/"
};
/**
 * Pre-built regular expressions shared by cleanHTML(), indent() and getHTML().
 * Entries are addressed by index, so the order must not change.
 *
 * The patterns are plain regex literals: RegExp.prototype.compile() is a
 * deprecated legacy method and `new RegExp().compile(/x/g)` yields a regex
 * equivalent to the literal itself.
 */
HTMLArea.RegExpCache = [
/*00*/ /<\s*\/?([^\s\/>]+)[\s*\/>]/gi,//lowercase tags
/*01*/ /(\S*\s*=\s*)?_moz[^=>]*(=\s*[^>]*)?/gi,//strip _moz attributes
/*02*/ /\s*=\s*(([^'"][^>\s]*)([>\s])|"([^"]+)"|'([^']+)')/g,// find attributes
/*03*/ /\/>/g,//strip singlet terminators
/*04*/ /<(br|hr|img|input|link|meta|param|embed|area)((\s*\S*="[^"]*")*)>/g,//terminate singlet tags (only matches when every attribute is double-quoted)
/*05*/ /(checked|compact|declare|defer|disabled|ismap|multiple|no(href|resize|shade|wrap)|readonly|selected)([\s>])/gi,//expand singlet attributes
/*06*/ /(="[^']*)'([^'"]*")/,//check quote nesting
/*07*/ /&(?=[^<]*>)/g,//expand query ampersands
/*08*/ /<\s+/g,//strip tagstart whitespace
/*09*/ /\s+(\/)?>/g,//trim whitespace
/*10*/ /\s{2,}/g,//trim extra whitespace
/*11*/ /\s+([^=\s]+)(="[^"]+")/g,// lowercase attribute names
/*12*/ /(\S*\s*=\s*)?contenteditable[^=>]*(=\s*[^>\s\/]*)?/gi,//strip contenteditable
/*13*/ /((href|src)=")([^\s]*)"/g, //find href and src for stripBaseHref()
/*14*/ /<\/?(div|p|h[1-6]|table|tr|td|th|ul|ol|li|blockquote|object|br|hr|img|embed|param|pre|script|html|head|body|meta|link|title|area)[^>]*>/g,//every tag that indent() puts on its own line
/*15*/ /<\/(div|p|h[1-6]|table|tr|td|th|ul|ol|li|blockquote|object|html|head|body|script)( [^>]*)?>/g,//blocklevel closing tag
/*16*/ /<(div|p|h[1-6]|table|tr|td|th|ul|ol|li|blockquote|object|html|head|body|script)( [^>]*)?>/g,//blocklevel opening tag
/*17*/ /<(br|hr|img|embed|param|pre|meta|link|title|area)[^>]*>/g,//singlet tag
/*18*/ /(^|<\/(pre|script)>)(\s|[^\s])*?(<(pre|script)[^>]*>|$)/g,//find content NOT inside pre and script tags
/*19*/ /(<pre[^>]*>)(\s|[^\s])*?(<\/pre>)/g,//find content inside pre tags
/*20*/ /(^|<!--(\s|\S)*?-->)((\s|\S)*?)(?=<!--(\s|\S)*?-->|$)/g,//find content NOT inside comments
/*21*/ /\S*=""/g, //find empty attributes
/*22*/ /<!--[\s\S]*?-->|<\?[\s\S]*?\?>|<[^>]*>/g //find all tags, including comments and php
];
/**
 * Cleans HTML into wellformed xhtml.
 *
 * Runs the input through the shared HTMLArea.RegExpCache expressions:
 * lowercases tag and attribute names, strips _moz and contenteditable
 * attributes, quotes attribute values, terminates singlet tags and trims
 * superfluous whitespace. getHTML() feeds it one tag at a time.
 *
 * Depending on the environment it then rewrites href/src values:
 *  - on IE each URL is passed through this.stripBaseURL();
 *  - with config.only7BitPrintablesInURLs set, every run of characters
 *    outside the printable ASCII range '!'..'~' is escape()d.
 *
 * @param {String} sHtml markup to clean
 * @return {String} the cleaned markup
 */
HTMLArea.prototype.cleanHTML = function(sHtml) {
	var c = HTMLArea.RegExpCache;
	var self = this;
	sHtml = sHtml.
		replace(c[0], function(str) { return str.toLowerCase(); }).//lowercase tags/attribute names
		replace(c[1], ' ').//strip _moz attributes
		replace(c[12], ' ').//strip contenteditable
		replace(c[2], '="$2$4$5"$3').//add attribute quotes
		replace(c[21], ' ').//strip empty attributes
		replace(c[11], function(str, p1, p2) { return ' ' + p1.toLowerCase() + p2; }).//lowercase attribute names
		replace(c[3], '>').//strip singlet terminators
		replace(c[9], '$1>').//trim whitespace
		replace(c[5], '$1="$1"$3').//expand singlet attributes
		replace(c[4], '<$1$2 />').//terminate singlet tags
		replace(c[6], '$1$2').//check quote nesting
		replace(c[8], '<').//strip tagstart whitespace
		replace(c[10], ' ');//trim extra whitespace

	// The URL rewrites use a replacer function so that every href/src is
	// rewritten from its OWN value. Building one replacement string from the
	// legacy static RegExp.$3 stamped the first URL onto every match and let
	// '$' sequences inside a URL be interpreted as replacement patterns.
	if (HTMLArea.is_ie) {
		sHtml = sHtml.replace(c[13], function(match, prefix, attr, url) {
			return prefix + self.stripBaseURL(url) + '"';
		});
	}
	if (this.config.only7BitPrintablesInURLs) {
		sHtml = sHtml.replace(c[13], function(match, prefix, attr, url) {
			var encoded = url.replace(/([^!-~]+)/g, function(chr) { return escape(chr); });
			return prefix + encoded + '"';
		});
	}
	return sHtml;
};
/**
 * Prettifies html by inserting linebreaks before tags, and indenting
 * blocklevel tags.
 *
 * Content inside <pre> and <script> elements and inside comments is left
 * untouched; everywhere else whitespace runs are collapsed to one space and
 * every tag listed in RegExpCache[14] is moved onto its own line.
 *
 * The running indentation is kept in HTMLArea.__nindent / __sindent /
 * __sindentChar, which are reset on every call.
 *
 * @param {String} s            markup to format
 * @param {String} [sindentChar] string added per nesting level (default " ")
 * @return {String} the formatted markup, without a leading newline and
 *                  without spaces at the end of lines
 */
HTMLArea.indent = function(s, sindentChar) {
	HTMLArea.__nindent = 0;
	HTMLArea.__sindent = "";
	HTMLArea.__sindentChar = (typeof sindentChar == "undefined") ? " " : sindentChar;
	var c = HTMLArea.RegExpCache;
	if (HTMLArea.is_gecko) { //moz changes returns into <br> inside <pre> tags
		s = s.replace(c[19], function(str) { return str.replace(/<br \/>/g, "\n"); });
	}
	s = s.replace(c[18], function(strn) { //skip pre and script tags
		return strn.replace(c[20], function(st, $1, $2, $3) { //exclude comments
			// $1 is the comment (if any) preceding this stretch, $3 the markup after it
			var formatted = $3.replace(/[\n\r]/gi, " ").replace(/\s+/gi, " ").replace(c[14], function(str) {
				if (str.match(c[16])) {
					// blocklevel openingtag - emit at current depth, then increase indent
					var line = "\n" + HTMLArea.__sindent + str;
					HTMLArea.__sindent += HTMLArea.__sindentChar;
					++HTMLArea.__nindent;
					return line;
				} else if (str.match(c[15])) {
					// blocklevel closingtag - decrease indent, then emit
					--HTMLArea.__nindent;
					HTMLArea.__sindent = "";
					for (var i = HTMLArea.__nindent; i > 0; --i) {
						HTMLArea.__sindent += HTMLArea.__sindentChar;
					}
					return "\n" + HTMLArea.__sindent + str;
				} else if (str.match(c[17])) {
					// singlet tag - own line, depth unchanged
					return "\n" + HTMLArea.__sindent + str;
				}
				return str; // reached for tags in c[14] that match none of the above, e.g. </pre>
			});
			return $1 + formatted;
		});
	});
	// Drop the newline inserted before the very first tag, then always strip
	// spaces at end of lines (an early return used to skip that cleanup
	// whenever the output started with a newline).
	if (s.charAt(0) == "\n") {
		s = s.substring(1, s.length);
	}
	return s.replace(/ *\n/g, '\n');
};
/**
 * Serialises a DOM node to cleaned XHTML.
 *
 * Every tag found by RegExpCache[22] in the node's innerHTML is passed to
 * editor.cleanHTML() on its own; comments and <? ... ?> blocks are copied
 * through untouched.
 *
 * @param {Node}    root       element (nodeType 1) or document fragment
 *                             (nodeType 11) to serialise
 * @param {Boolean} outputRoot when true the root element's own start and end
 *                             tag are included (only happens with <html> in
 *                             fullpage mode)
 * @param {Object}  editor     editor instance; provides cleanHTML(),
 *                             config.htmlRemoveTags and _doc
 * @return {String} the serialised markup
 */
HTMLArea.getHTML = function(root, outputRoot, editor) {
	var html = "";
	var c = HTMLArea.RegExpCache;

	if (root.nodeType == 11) {//document fragment
		//we can't get innerHTML from the root (type 11) node, so we
		//copy all the child nodes into a new div and get innerHTML from the div.
		//The div stays detached so the caller's fragment is left unmodified.
		var ownerDoc = root.ownerDocument || document;
		var div = ownerDoc.createElement("div");
		for (var child = root.firstChild; child; child = child.nextSibling) {
			div.appendChild(child.cloneNode(true));
		}
		html += div.innerHTML.replace(c[22], function(tag) {
			if (/^<[!\?]/.test(tag)) return tag; //skip comments and php tags
			return editor.cleanHTML(tag);
		});

	} else {

		var root_tag = (root.nodeType == 1) ? root.tagName.toLowerCase() : '';
		if (outputRoot) { //only happens with <html> tag in fullpage mode
			html += "<" + root_tag;
			var attrs = root.attributes; // strangely, this doesn't work in moz
			for (var i = 0; i < attrs.length; ++i) {
				var a = attrs.item(i);
				// NOTE(review): three lines were unreadable here in the source
				// this was restored from; a skip of attributes that were not
				// explicitly specified is assumed -- confirm against upstream.
				if (!a.specified) {
					continue;
				}
				var name = a.nodeName.toLowerCase();
				var value = a.nodeValue;
				html += " " + name + '="' + value + '"';
			}
			html += ">";
		}
		var innerhtml;
		if (root_tag == "html") {
			innerhtml = editor._doc.documentElement.innerHTML;
		} else {
			innerhtml = root.innerHTML;
		}
		//pass tags to cleanHTML() one at a time
		//includes support for htmlRemoveTags config option
		html += innerhtml.replace(c[22], function(tag) {
			if (/^<[!\?]/.test(tag)) return tag; //skip comments and php tags
			var removeTags = editor.config.htmlRemoveTags;
			if (!(removeTags && removeTags.test(tag.replace(/<([^\s>\/]+)/, '$1')))) {
				return editor.cleanHTML(tag);
			}
			return ''; // tag is listed in htmlRemoveTags: drop it
		});
		//IE drops all </li> tags in a list except the last one
		// NOTE(review): the guard line was unreadable in the source; an IE-only
		// guard is assumed from the comment above -- confirm against upstream.
		if (HTMLArea.is_ie) {
			html = html.replace(/<li( [^>]*)?>/g, '</li><li$1>').
				replace(/(<(ul|ol)[^>]*>)[\s\n]*<\/li>/g, '$1').
				replace(/<\/li>([\s\n]*<\/li>)+/g, '<\/li>');
		}
		if (HTMLArea.is_gecko) {
			html = html.replace(/(.*)<br \/>\n$/, '$1'). //strip trailing <br> added by moz
				replace(/^\n(.*)/, '$1'); //strip leading newline added by moz
		}
		if (outputRoot) {
			html += "</" + root_tag + ">";
		}
		// html = HTMLArea.indent(html);//see bug #6106
	}
	// html = HTMLArea.htmlEncode(html);

	return html;
};
//override (hack) outwardHtml() to handle onclick suppression
HTMLArea.prototype._origOutwardHtml = HTMLArea.prototype.outwardHtml;

/**
 * Removes the design-mode guard from onclick="...window.open(" handlers and
 * then delegates to the outwardHtml() implementation that was installed
 * before this override.
 *
 * @param {String} html markup leaving the editor
 * @return {String} the result of the previous outwardHtml()
 */
HTMLArea.prototype.outwardHtml = function(html) {
	var guarded = "onclick=\"try{if(document.designMode && document.designMode == 'on') return false;}catch(e){} window.open(";
	// split/join replaces EVERY occurrence; String.replace with a string
	// pattern would only rewrite the first guarded handler in the document.
	html = html.split(guarded).join("onclick=\"window.open(");
	html = this._origOutwardHtml(html);
	return html;
};