ConnectionManager: Disable hard limit in favor of twiddled hammering values.
[thrasher.git] / perl / lib / Thrasher / XHTML_IM_Normalize.pm
blob67825d9c2c5564cabe92650d7913811b98e45741
1 package Thrasher::XHTML_IM_Normalize;
2 use strict;
3 use warnings;
5 use Thrasher::HTMLNormalize qw(normalize);
7 =pod
9 =head1 NAME
11 Thrasher::XHTMLNormalize - provides utility functions for creating
12 XHTML-IM (XEP-0071) messages out of some HTML-like stuff
14 =head1 SYNOPSIS
16 use Thrasher::XHTMLNormalize qw(xhtml xhtml_and_text);
18 my $test_string = "<html><body><p><i>Almost <b>XHTML</i></b>";
20 # Will return something suitable for insertion as XML.
21 # In this case, '<p><i>Almost <b>XHTML</b></i></p>'.
22 my $xhtml = xhtml($test_string);
24 # Produces the same $xhtml as the previous statement, but
25 # $text will have just the pure-text part of the string.
26 my ($text, $xhtml) = xhtml_and_text($test_string);
28 =head1 DESCRIPTION
30 This utilizes Thrasher::HTMLNormalize to create two functions, one
31 which takes vaguely HTML-ish stuff and outputs XHTML-IM/XEP-0071
32 complaint text suitable for inclusion in an XML stream, and
33 one which additionally returns a pure-text version.
35 The RECOMMENDEDs, SHOULDs, and SHOULD NOTs are resolved
36 as follows:
38 =over
40 =item *
42 Per section 7.1 of XEP-0071, we ignore C<html>, C<head>, and
43 C<title>, stripping them out, and we ignore any C<body> tag,
44 replacing it with our own, on the theory that if we need this
45 normalization in the first place it's probably because we're
46 not actually processing XHTML-IM.
48 =item *
50 Section 7.6.1's recommend styles is upgraded from RECOMMENDED to
51 REQUIRED; no other styles will be permitted to pass through.
53 =item *
55 Section 7.7.1 NOT RECOMMENDs class, id, title, and xml:lang.
56 These are always discarded, in accordance with the given logic.
57 Even if you had some crazy XMPP app that actually expected to
58 be able to push out stylesheets, things on the other end of
59 a transport have no chance of using these correctly.
61 Section 7.7.2 NOT RECOMMENDEDs are also tossed out.
63 =item *
65 There is no support for xml:lang. I don't think anything is
66 emitted localized messages with multiple bodies in a way that works
67 with this correctly.
69 =item *
71 Section 8, bullet point 9 recommends \n => <br/>; this will
72 do that, it also replaces \n\n => <p/>.
74 =back
76 =cut
78 use base 'Exporter';
80 our @EXPORT_OK = qw(xhtml xhtml_and_text text);
81 our %EXPORT_TAGS = (all => \@EXPORT_OK);
83 my %ACCEPTABLE_STYLES = map { $_ => 1 }
84 qw(background-color color font-family font-size font-style
85 font-weight margin-left margin-right text-align
86 text-decoration);
88 sub clean_style {
89 my $style = shift;
91 my @style_chunks = split /;/, $style;
93 my @permitted_chunks;
94 for my $chunk (@style_chunks) {
95 my ($name, $value) = split /\s*:\s*/, $chunk;
96 if (!defined($value)) {
97 # No actual value
98 next;
101 $name = lc $name; # force lowercase, which it should be anyhow
102 if (!$ACCEPTABLE_STYLES{$name}) {
103 next;
106 push @permitted_chunks, "$name\:$value";
109 if (@permitted_chunks) {
110 return join '; ', @permitted_chunks;
111 } else {
112 return 0;
116 my $standard_atts = {style => \&clean_style};
117 my $with_cite = {style => \&clean_style, cite => 1};
119 my $xhtml_im_permissions =
121 # XEP-0071 section 6.1 - all rejected, see perldocs
123 # section 6.2, as described
124 abbr => $standard_atts,
125 acronym => $standard_atts,
126 address => $standard_atts,
127 br => $standard_atts,
128 cite => $standard_atts,
129 code => $standard_atts,
130 dfn => $standard_atts,
131 div => $standard_atts,
132 em => $standard_atts,
133 h1 => $standard_atts,
134 h2 => $standard_atts,
135 h3 => $standard_atts,
136 h4 => $standard_atts,
137 h5 => $standard_atts,
138 h6 => $standard_atts,
139 kdb => $standard_atts,
140 p => $standard_atts,
141 pre => $standard_atts,
142 q => $standard_atts,
143 samp => $standard_atts,
144 span => $standard_atts,
145 strong => $standard_atts,
146 var => $standard_atts,
148 blockquote => $with_cite,
149 q => $with_cite,
151 # Section 6.3, modded by 7.7.2
152 a => {style => \&clean_style, type => 1,
153 href => \&Thrasher::HTMLNormalize::fixURL},
155 # Section 6.4
156 dl => $standard_atts,
157 dt => $standard_atts,
158 dd => $standard_atts,
159 ol => $standard_atts,
160 ul => $standard_atts,
161 li => $standard_atts,
163 # Section 6.5, modded by 7.7.2
164 img => {style => \&clean_style,
165 alt => 1, height => 1, src => 1, width => 1},
168 # This translates from certain HTML tags to equivalent tags
169 # within the
170 my $tag_translations = {
171 'b' => 'strong',
172 'i' => 'em'
175 # No tags for you!
176 my $plain_text_permissions = {};
178 sub xhtml {
179 my $original_text = shift;
180 my $normalized = normalize($original_text, $xhtml_im_permissions, 0,
181 1, $tag_translations);
182 # Strip out the first <p>, since it tends to get in our way
183 # in this application.
184 $normalized =~ s/^\<p[^>]*\>(.*?)\<\/p[^>]*\>/$1/;
185 return $normalized;
188 sub xhtml_and_text {
189 my $original_text = shift;
191 my $xhtml = xhtml($original_text);
192 my $text = text($original_text);
194 return ($xhtml, $text);
197 sub text {
198 my $original_text = shift;
199 my $text = normalize($original_text, $plain_text_permissions, 0,
200 1, undef, 1);
201 return $text;