perl/lib/Thrasher/XHTML_IM_Normalize.pm

   1 package Thrasher::XHTML_IM_Normalize;
   2 use strict;
   3 use warnings;
   4
   5 use Thrasher::HTMLNormalize qw(normalize);
   6
   7 =pod
   8
   9 =head1 NAME
  10
  11 Thrasher::XHTMLNormalize - provides utility functions for creating
  12 XHTML-IM (XEP-0071) messages out of some HTML-like stuff
  13
  14 =head1 SYNOPSIS
  15
  16  use Thrasher::XHTMLNormalize qw(xhtml xhtml_and_text);
  17
  18  my $test_string = "<html><body><p><i>Almost <b>XHTML</i></b>";
  19
  20  # Will return something suitable for insertion as XML.
  21  # In this case, '<p><i>Almost <b>XHTML</b></i></p>'.
  22  my $xhtml = xhtml($test_string);
  23
  24  # Produces the same $xhtml as the previous statement, but
  25  # $text will have just the pure-text part of the string.
  26  my ($text, $xhtml) = xhtml_and_text($test_string);
  27
  28 =head1 DESCRIPTION
  29
  30 This utilizes Thrasher::HTMLNormalize to create two functions, one
  31 which takes vaguely HTML-ish stuff and outputs XHTML-IM/XEP-0071
  32 complaint text suitable for inclusion in an XML stream, and
  33 one which additionally returns a pure-text version.
  34
  35 The RECOMMENDEDs, SHOULDs, and SHOULD NOTs are resolved
  36 as follows:
  37
  38 =over
  39
  40 =item *
  41
  42 Per section 7.1 of XEP-0071, we ignore C<html>, C<head>, and
  43 C<title>, stripping them out, and we ignore any C<body> tag,
  44 replacing it with our own, on the theory that if we need this
  45 normalization in the first place it's probably because we're
  46 not actually processing XHTML-IM.
  47
  48 =item *
  49
  50 Section 7.6.1's recommend styles is upgraded from RECOMMENDED to
  51 REQUIRED; no other styles will be permitted to pass through.
  52
  53 =item *
  54
  55 Section 7.7.1 NOT RECOMMENDs class, id, title, and xml:lang.
  56 These are always discarded, in accordance with the given logic.
  57 Even if you had some crazy XMPP app that actually expected to
  58 be able to push out stylesheets, things on the other end of
  59 a transport have no chance of using these correctly.
  60
  61 Section 7.7.2 NOT RECOMMENDEDs are also tossed out.
  62
  63 =item *
  64
  65 There is no support for xml:lang. I don't think anything is
  66 emitted localized messages with multiple bodies in a way that works
  67 with this correctly.
  68
  69 =item *
  70
  71 Section 8, bullet point 9 recommends \n => <br/>; this will
  72 do that, it also replaces \n\n => <p/>.
  73
  74 =back
  75
  76 =cut
  77
  78 use base 'Exporter';
  79
  80 our @EXPORT_OK = qw(xhtml xhtml_and_text text);
  81 our %EXPORT_TAGS = (all => \@EXPORT_OK);
  82
  83 my %ACCEPTABLE_STYLES = map { $_ => 1 }
  84     qw(background-color color font-family font-size font-style
  85        font-weight margin-left margin-right text-align
  86        text-decoration);
  87
  88 sub clean_style {
  89     my $style = shift;
  90
  91     my @style_chunks = split /;/, $style;
  92
  93     my @permitted_chunks;
  94     for my $chunk (@style_chunks) {
  95         my ($name, $value) = split /\s*:\s*/, $chunk;
  96         if (!defined($value)) {
  97             # No actual value
  98             next;
  99         }
 100
 101         $name = lc $name; # force lowercase, which it should be anyhow
 102         if (!$ACCEPTABLE_STYLES{$name}) {
 103             next;
 104         }
 105
 106         push @permitted_chunks, "$name\:$value";
 107     }
 108
 109     if (@permitted_chunks) {
 110         return join '; ', @permitted_chunks;
 111     } else {
 112         return 0;
 113     }
 114 }
 115
 116 my $standard_atts = {style => \&clean_style};
 117 my $with_cite = {style => \&clean_style, cite => 1};
 118
 119 my $xhtml_im_permissions =
 120 {
 121     # XEP-0071 section 6.1 - all rejected, see perldocs
 122
 123     # section 6.2, as described
 124     abbr => $standard_atts,
 125     acronym => $standard_atts,
 126     address => $standard_atts,
 127     br => $standard_atts,
 128     cite => $standard_atts,
 129     code => $standard_atts,
 130     dfn => $standard_atts,
 131     div => $standard_atts,
 132     em => $standard_atts,
 133     h1 => $standard_atts,
 134     h2 => $standard_atts,
 135     h3 => $standard_atts,
 136     h4 => $standard_atts,
 137     h5 => $standard_atts,
 138     h6 => $standard_atts,
 139     kdb => $standard_atts,
 140     p => $standard_atts,
 141     pre => $standard_atts,
 142     q => $standard_atts,
 143     samp => $standard_atts,
 144     span => $standard_atts,
 145     strong => $standard_atts,
 146     var => $standard_atts,
 147
 148     blockquote => $with_cite,
 149     q => $with_cite,
 150
 151     # Section 6.3, modded by 7.7.2
 152     a => {style => \&clean_style, type => 1,
 153           href => \&Thrasher::HTMLNormalize::fixURL},
 154
 155     # Section 6.4
 156     dl => $standard_atts,
 157     dt => $standard_atts,
 158     dd => $standard_atts,
 159     ol => $standard_atts,
 160     ul => $standard_atts,
 161     li => $standard_atts,
 162
 163     # Section 6.5, modded by 7.7.2
 164     img => {style => \&clean_style,
 165             alt => 1, height => 1, src => 1, width => 1},
 166 };
 167
 168 # This translates from certain HTML tags to equivalent tags
 169 # within the
 170 my $tag_translations = {
 171     'b' => 'strong',
 172     'i' => 'em'
 173 };
 174
 175 # No tags for you!
 176 my $plain_text_permissions = {};
 177
 178 sub xhtml {
 179     my $original_text = shift;
 180     my $normalized = normalize($original_text, $xhtml_im_permissions, 0,
 181                      1, $tag_translations);
 182     # Strip out the first <p>, since it tends to get in our way
 183     # in this application.
 184     $normalized =~ s/^\<p[^>]*\>(.*?)\<\/p[^>]*\>/$1/;
 185     return $normalized;
 186 }
 187
 188 sub xhtml_and_text {
 189     my $original_text = shift;
 190
 191     my $xhtml = xhtml($original_text);
 192     my $text = text($original_text);
 193
 194     return ($xhtml, $text);
 195 }
 196
 197 sub text {
 198     my $original_text = shift;
 199     my $text = normalize($original_text, $plain_text_permissions, 0,
 200                          1, undef, 1);
 201     return $text;
 202 }