1 package Thrasher
::XHTML_IM_Normalize
;
5 use Thrasher
::HTMLNormalize
qw(normalize);
11 Thrasher::XHTMLNormalize - provides utility functions for creating
12 XHTML-IM (XEP-0071) messages out of some HTML-like stuff
16 use Thrasher::XHTMLNormalize qw(xhtml xhtml_and_text);
18 my $test_string = "<html><body><p><i>Almost <b>XHTML</i></b>";
20 # Will return something suitable for insertion as XML.
21 # In this case, '<p><i>Almost <b>XHTML</b></i></p>'.
22 my $xhtml = xhtml($test_string);
24 # Produces the same $xhtml as the previous statement, but
25 # $text will have just the pure-text part of the string.
26 my ($text, $xhtml) = xhtml_and_text($test_string);
30 This utilizes Thrasher::HTMLNormalize to create two functions, one
31 which takes vaguely HTML-ish stuff and outputs XHTML-IM/XEP-0071
32 complaint text suitable for inclusion in an XML stream, and
33 one which additionally returns a pure-text version.
35 The RECOMMENDEDs, SHOULDs, and SHOULD NOTs are resolved
42 Per section 7.1 of XEP-0071, we ignore C<html>, C<head>, and
43 C<title>, stripping them out, and we ignore any C<body> tag,
44 replacing it with our own, on the theory that if we need this
45 normalization in the first place it's probably because we're
46 not actually processing XHTML-IM.
50 Section 7.6.1's recommend styles is upgraded from RECOMMENDED to
51 REQUIRED; no other styles will be permitted to pass through.
55 Section 7.7.1 NOT RECOMMENDs class, id, title, and xml:lang.
56 These are always discarded, in accordance with the given logic.
57 Even if you had some crazy XMPP app that actually expected to
58 be able to push out stylesheets, things on the other end of
59 a transport have no chance of using these correctly.
61 Section 7.7.2 NOT RECOMMENDEDs are also tossed out.
65 There is no support for xml:lang. I don't think anything is
66 emitted localized messages with multiple bodies in a way that works
71 Section 8, bullet point 9 recommends \n => <br/>; this will
72 do that, it also replaces \n\n => <p/>.
80 our @EXPORT_OK = qw(xhtml xhtml_and_text text);
81 our %EXPORT_TAGS = (all
=> \
@EXPORT_OK);
83 my %ACCEPTABLE_STYLES = map { $_ => 1 }
84 qw(background-color color font-family font-size font-style
85 font-weight margin-left margin-right text-align
91 my @style_chunks = split /;/, $style;
94 for my $chunk (@style_chunks) {
95 my ($name, $value) = split /\s*:\s*/, $chunk;
96 if (!defined($value)) {
101 $name = lc $name; # force lowercase, which it should be anyhow
102 if (!$ACCEPTABLE_STYLES{$name}) {
106 push @permitted_chunks, "$name\:$value";
109 if (@permitted_chunks) {
110 return join '; ', @permitted_chunks;
116 my $standard_atts = {style
=> \
&clean_style
};
117 my $with_cite = {style
=> \
&clean_style
, cite
=> 1};
119 my $xhtml_im_permissions =
121 # XEP-0071 section 6.1 - all rejected, see perldocs
123 # section 6.2, as described
124 abbr
=> $standard_atts,
125 acronym
=> $standard_atts,
126 address
=> $standard_atts,
127 br
=> $standard_atts,
128 cite
=> $standard_atts,
129 code
=> $standard_atts,
130 dfn
=> $standard_atts,
131 div
=> $standard_atts,
132 em
=> $standard_atts,
133 h1
=> $standard_atts,
134 h2
=> $standard_atts,
135 h3
=> $standard_atts,
136 h4
=> $standard_atts,
137 h5
=> $standard_atts,
138 h6
=> $standard_atts,
139 kdb
=> $standard_atts,
141 pre
=> $standard_atts,
143 samp
=> $standard_atts,
144 span
=> $standard_atts,
145 strong
=> $standard_atts,
146 var
=> $standard_atts,
148 blockquote
=> $with_cite,
151 # Section 6.3, modded by 7.7.2
152 a
=> {style
=> \
&clean_style
, type
=> 1,
153 href
=> \
&Thrasher
::HTMLNormalize
::fixURL
},
156 dl
=> $standard_atts,
157 dt
=> $standard_atts,
158 dd
=> $standard_atts,
159 ol
=> $standard_atts,
160 ul
=> $standard_atts,
161 li
=> $standard_atts,
163 # Section 6.5, modded by 7.7.2
164 img
=> {style
=> \
&clean_style
,
165 alt
=> 1, height
=> 1, src
=> 1, width
=> 1},
168 # This translates from certain HTML tags to equivalent tags
170 my $tag_translations = {
176 my $plain_text_permissions = {};
179 my $original_text = shift;
180 my $normalized = normalize
($original_text, $xhtml_im_permissions, 0,
181 1, $tag_translations);
182 # Strip out the first <p>, since it tends to get in our way
183 # in this application.
184 $normalized =~ s/^\<p[^>]*\>(.*?)\<\/p[^>]*\>/$1/;
189 my $original_text = shift;
191 my $xhtml = xhtml
($original_text);
192 my $text = text
($original_text);
194 return ($xhtml, $text);
198 my $original_text = shift;
199 my $text = normalize
($original_text, $plain_text_permissions, 0,