XHTML_IM_Normalize: factor out text-decoration span callback creator.
[thrasher.git] / perl / lib / Thrasher / XHTML_IM_Normalize.pm
blob9cefb4ca4864b8d56d128fb53fa949a5109d5d4d
1 package Thrasher::XHTML_IM_Normalize;
2 use strict;
3 use warnings;
5 use Thrasher::HTMLNormalize qw(normalize);
7 =pod
9 =head1 NAME
11 Thrasher::XHTMLNormalize - provides utility functions for creating
12 XHTML-IM (XEP-0071) messages out of some HTML-like stuff
14 =head1 SYNOPSIS
16 use Thrasher::XHTMLNormalize qw(xhtml xhtml_and_text);
18 my $test_string = "<html><body><p><i>Almost <b>XHTML</i></b>";
20 # Will return something suitable for insertion as XML.
21 # In this case, '<p><i>Almost <b>XHTML</b></i></p>'.
22 my $xhtml = xhtml($test_string);
24 # Produces the same $xhtml as the previous statement, but
25 # $text will have just the pure-text part of the string.
26 my ($text, $xhtml) = xhtml_and_text($test_string);
28 =head1 DESCRIPTION
30 This utilizes Thrasher::HTMLNormalize to create two functions, one
31 which takes vaguely HTML-ish stuff and outputs XHTML-IM/XEP-0071
32 complaint text suitable for inclusion in an XML stream, and
33 one which additionally returns a pure-text version.
35 The RECOMMENDEDs, SHOULDs, and SHOULD NOTs are resolved
36 as follows:
38 =over
40 =item *
42 Per section 7.1 of XEP-0071, we ignore C<html>, C<head>, and
43 C<title>, stripping them out, and we ignore any C<body> tag,
44 replacing it with our own, on the theory that if we need this
45 normalization in the first place it's probably because we're
46 not actually processing XHTML-IM.
48 =item *
50 Section 7.6.1's recommend styles is upgraded from RECOMMENDED to
51 REQUIRED; no other styles will be permitted to pass through.
53 =item *
55 Section 7.7.1 NOT RECOMMENDs class, id, title, and xml:lang.
56 These are always discarded, in accordance with the given logic.
57 Even if you had some crazy XMPP app that actually expected to
58 be able to push out stylesheets, things on the other end of
59 a transport have no chance of using these correctly.
61 Section 7.7.2 NOT RECOMMENDEDs are also tossed out.
63 =item *
65 There is no support for xml:lang. I don't think anything is
66 emitted localized messages with multiple bodies in a way that works
67 with this correctly.
69 =item *
71 Section 8, bullet point 9 recommends \n => <br/>; this will
72 do that, it also replaces \n\n => <p/>.
74 =back
76 =cut
78 use base 'Exporter';
80 our @EXPORT_OK = qw(xhtml xhtml_and_text text);
81 our %EXPORT_TAGS = (all => \@EXPORT_OK);
83 my %ACCEPTABLE_STYLES = map { $_ => 1 }
84 qw(background-color color font-family font-size font-style
85 font-weight margin-left margin-right text-align
86 text-decoration);
88 sub clean_style {
89 my $style = shift;
91 my @style_chunks = split /;/, $style;
93 my @permitted_chunks;
94 for my $chunk (@style_chunks) {
95 my ($name, $value) = split /\s*:\s*/, $chunk;
96 if (!defined($value)) {
97 # No actual value
98 next;
101 $name = lc $name; # force lowercase, which it should be anyhow
102 if (!$ACCEPTABLE_STYLES{$name}) {
103 next;
106 push @permitted_chunks, "$name\:$value";
109 if (@permitted_chunks) {
110 return join '; ', @permitted_chunks;
111 } else {
112 return 0;
116 my $standard_atts = {style => \&clean_style};
117 my $with_cite = {style => \&clean_style, cite => 1};
119 my $xhtml_im_permissions =
121 # XEP-0071 section 6.1 - all rejected, see perldocs
123 # section 6.2, as described
124 abbr => $standard_atts,
125 acronym => $standard_atts,
126 address => $standard_atts,
127 br => $standard_atts,
128 cite => $standard_atts,
129 code => $standard_atts,
130 dfn => $standard_atts,
131 div => $standard_atts,
132 em => $standard_atts,
133 h1 => $standard_atts,
134 h2 => $standard_atts,
135 h3 => $standard_atts,
136 h4 => $standard_atts,
137 h5 => $standard_atts,
138 h6 => $standard_atts,
139 kdb => $standard_atts,
140 p => $standard_atts,
141 pre => $standard_atts,
142 q => $standard_atts,
143 samp => $standard_atts,
144 span => $standard_atts,
145 strong => $standard_atts,
146 var => $standard_atts,
148 blockquote => $with_cite,
149 q => $with_cite,
151 # Section 6.3, modded by 7.7.2
152 a => {style => \&clean_style, type => 1,
153 href => \&Thrasher::HTMLNormalize::fixURL},
155 # Section 6.4
156 dl => $standard_atts,
157 dt => $standard_atts,
158 dd => $standard_atts,
159 ol => $standard_atts,
160 ul => $standard_atts,
161 li => $standard_atts,
163 # Section 6.5, modded by 7.7.2
164 img => {style => \&clean_style,
165 alt => 1, height => 1, src => 1, width => 1},
168 # $tag_translations translate certain HTML tags to equivalent XHTML(-IM) tags.
170 # throw_a_spanner($text_decoration): Return a tag translator to modify
171 # a tokenizing $groups to be a <span> with the given text-decoration style.
172 sub throw_a_spanner {
173 my ($text_decoration) = @_;
174 return sub {
175 my ($groups) = @_;
176 $groups->{'tagname'} = 'span';
177 if (! $groups->{'slash'}) {
178 $groups->{'tagremains'}
179 = qq|style="text-decoration: $text_decoration;"|;
183 my $tag_translations = {
184 'b' => 'strong',
185 'i' => 'em',
186 'u' => throw_a_spanner('underline'),
189 # No tags for you!
190 my $plain_text_permissions = {};
192 sub xhtml {
193 my $original_text = shift;
194 my $normalized = normalize($original_text, $xhtml_im_permissions, 0,
195 1, $tag_translations);
196 # Strip out the first <p>, since it tends to get in our way
197 # in this application.
198 $normalized =~ s/^\<p[^>]*\>(.*?)\<\/p[^>]*\>/$1/;
199 return $normalized;
202 sub xhtml_and_text {
203 my $original_text = shift;
205 my $xhtml = xhtml($original_text);
206 my $text = text($original_text);
208 return ($xhtml, $text);
211 sub text {
212 my $original_text = shift;
213 my $text = normalize($original_text, $plain_text_permissions, 0,
214 1, undef, 1);
215 return $text;