perl/lib/Thrasher/HTMLNormalize.pm

   1 package Thrasher::HTMLNormalize;
   2 use strict;
   3 use warnings;
   4
   5 use Thrasher::HTMLNormalize::Entities qw(html_entity);
   6
   7 use base 'Exporter';
   8
   9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
  10                     &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
  11
  12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
  13
  14 my $VALIDATE_AS_XML = 0;
  15
  16 ###
  17 ### PLEASE NOTE: This module has unit tests. Changing this module
  18 ###              without verifying them is a bad idea.
  19 ###
  20
  21 # Also note that this file isn't as dirty as it looks; that's
  22 # just the natural result of sticking an entire compiler in one
  23 # file.
  24
  25 ###
  26 ### CONSTANT AND DATA DECLARATIONS
  27 ###
  28 ### Recall "my" is actually file scoped, not package scoped.
  29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
  30
  31 # A set-like hash indicating which entities are allowed
  32 # to exist as text-named entities, and not converted into numeric
  33 # entities.
  34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
  35
  36 my %ESCAPE_ENTITIES = ('&' => '&amp;',
  37                        '>' => '&gt;',
  38                        '<' => '&lt;' );
  39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
  40                       "'" => '&apos;',
  41                       '"' => '&quot;');
  42
  43 # Tags that sort of close themselves
  44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
  45
  46 # Token types
  47 my $TEXT          = 1;
  48 my $TAG           = 2;
  49 my $TAGCLOSE      = 3;
  50 my $SELFCLOSEDTAG = 4;
  51 my $ENTITY        = 5;
  52 my $NEWLINE       = 6;
  53 my $DOUBLENEWLINE = 7;
  54 my $CDATA         = 8;
  55
  56 # Tag types. These are used for formatting the resulting text.
  57 # A block tag, as in CSS
  58 my $BLOCK         = 10;
  59 # An inline tag, as in CSS
  60 my $INLINE        = 11;
  61 # A 'block' tag that should be converted to a self-closing
  62 # tag if an empty instance is found (i.e., <br></br> => <br />)
  63 my $BLOCK_SELF_CLOSE = 12;
  64
  65 my %TAG_TYPES =
  66     (
  67      p => $BLOCK,
  68      div => $BLOCK,
  69      span => $INLINE,
  70      br => $BLOCK_SELF_CLOSE,
  71      a => $INLINE,
  72      li => $BLOCK,
  73      dl => $BLOCK,
  74      dt => $BLOCK,
  75      dd => $BLOCK,
  76      ul => $BLOCK,
  77      font => $INLINE,
  78      abbr => $INLINE,
  79      acronym => $INLINE,
  80      b => $INLINE,
  81      bgsound => $INLINE,
  82      big => $INLINE,
  83      caption => $INLINE,
  84      img => $INLINE,
  85      blockquote => $BLOCK,
  86      h1 => $BLOCK,
  87      h2 => $BLOCK,
  88      h3 => $BLOCK,
  89      h4 => $BLOCK,
  90      h5 => $BLOCK,
  91      h6 => $BLOCK,
  92      em => $INLINE,
  93      strong => $INLINE,
  94      dfn => $INLINE,
  95      code => $INLINE,
  96      samp => $INLINE,
  97      kbd => $INLINE,
  98      var => $INLINE,
  99      cite => $INLINE,
 100      hr => $BLOCK_SELF_CLOSE,
 101      i => $INLINE,
 102      ol => $BLOCK,
 103      pre => $BLOCK,
 104      small => $INLINE,
 105      strike => $INLINE,
 106      'sub' => $INLINE,
 107      sup => $INLINE,
 108      tt => $INLINE,
 109      table => $BLOCK,
 110      tr => $BLOCK,
 111      td => $BLOCK,
 112      u => $INLINE,
 113      html => $BLOCK,
 114      body => $BLOCK
 115      );
 116
 117 # A reasonable pre-defined behavior for URLs. If it has a protocol
 118 # in the front, it *must* be http or https. Anything else gets
 119 # destroyed. If it doesn't have a protocol in the front, we'll
 120 # at least take a shot and prepend http://.
 121 sub fixURL {
 122     my $url = shift;
 123     if ($url !~ /^https?:\/\//) {
 124         # has non-http protocol. Evil. Die now.
 125         # or maybe an incorrectly-specified port. Die now anyhow.
 126         if ($url =~ /^[^:]+:/) {
 127             return undef;
 128         }
 129
 130         return 'http://' . $url;
 131     }
 132     return $url;
 133 }
 134
 135 # A reasonable pre-defined conservative set. Allows a reasonable
 136 # range of simple formatting tags, but the only attributes allowed
 137 # are URLS (and those only with http or https protocols), and
 138 # titles for abbr and acronyms. Certainly no code execution allowed.
 139 our $CONSERVATIVE_PERMISSIONS = {
 140     p => {},
 141     br => {},
 142     a => {href => \&fixURL},
 143     ul => {},
 144     li => {},
 145     abbr => {title => 1},
 146     acronym => {title => 1},
 147     b => {},
 148     big => {},
 149     blockquote => {},
 150     em => {},
 151     strong => {},
 152     dfn => {},
 153     code => {},
 154     samp => {},
 155     kbd => {},
 156     var => {},
 157     cite => {},
 158     hr => {},
 159     i => {},
 160     ol => {},
 161     pre => {},
 162     small => {},
 163     strike => {},
 164     sub => {},
 165     sup => {},
 166     tt => {},
 167     u => {}
 168 };
 169
 170 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
 171 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \%CONSERVATIVE_PERMISSIONS_NO_LINKS;
 172 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a};
 173
 174 ###
 175 ### UTILITY FUNCTIONS
 176 ###
 177
 178 # Return whether $s is the name of a valid entity
 179 sub is_valid_entity {
 180     my $s = shift;
 181     return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
 182 }
 183
 184 # Given an entity name, either return that entity or an empty
 185 # string to elide it.
 186 sub make_entity {
 187     my $s = shift;
 188     if (is_valid_entity($s)) {
 189         return "&$s;";
 190     } else {
 191         return '';
 192     }
 193 }
 194
 195 sub escape {
 196     my $s = shift;
 197     $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
 198     return $s;
 199 }
 200 sub escape_quote {
 201     my $s = shift;
 202     $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
 203     return $s;
 204 }
 205
 206 # Nicer interface to create tokens; just specify the last bit.
 207 sub token {
 208     my $token_type = shift;
 209     $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
 210     return $token_type->new(@_);
 211 }
 212
 213 # Tokenize a tag or entity. We know it's one of them because of
 214 # how it was parsed. (Pulled out for simple testing.)
 215 sub tokenize_tag_or_entity {
 216     my $tag_or_entity = shift;
 217     my $allow_taglike = shift;
 218     my $tag_translation = shift;
 219
 220     # \# is to convince emacs syntax highlighting that # is not a comment
 221     if ($tag_or_entity !~ /((<(\/)?(\w+)([^>]*)>)|(&(\#?\w+);))/) {
 222         die "tokenize_tag_or_entity must be passed a tag or entity: "
 223             .$tag_or_entity;
 224     }
 225
 226     # Convert that mass of matches into something comprehensible
 227     my $groups = {tag => $2,    # the whole of the tag
 228                   slash => $3,  # if it has a slash to close the tag
 229                   tagname => $4,# the tag name
 230                   tagremains => $5, # the rest of the tag (atts)
 231                   entity => $6, # the whole of the entity
 232                   entityname => $7 };
 233
 234     $groups->{tagname} = lc $groups->{tagname};
 235
 236     if (defined(my $tag_translator =
 237                     $tag_translation->{lc $groups->{tagname}})) {
 238         # use Data::Dumper;
 239         #print "Initial: " . Dumper($groups);
 240         if (ref($tag_translator) eq '') {
 241             $groups->{tagname} = $tag_translator;
 242         } elsif (ref($tag_translator) eq 'CODE') {
 243             $tag_translator->($groups);
 244         }
 245         #print "Post-translation: " . Dumper($groups);
 246     }
 247
 248     if ($groups->{tag} && $allow_taglike &&
 249         !$TAG_TYPES{lc $groups->{tagname}}) {
 250         return token('Text', $tag_or_entity);
 251     } elsif ($groups->{tag}) {
 252         # handle tokenizing a tag
 253         if ($groups->{slash}) {
 254             return token('TagClose', $groups->{tagname});
 255         }
 256
 257         my $token_type = 'Tag';
 258
 259         my $atts = $groups->{tagremains};
 260         if ($atts && substr($atts, -1) eq '/') {
 261             $token_type = 'SelfClosedTag';
 262             $atts = substr($atts, 0, length($atts) - 1); # pop off /
 263         }
 264
 265         my $atts_hash = {};
 266
 267         # Pull out the attributes
 268         while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
 269             my $name = $1;
 270                 # one value delimited by ", the other by ',
 271                 # the final by whitespace, the final case
 272                 # covers those few atts in HTML that can
 273                 # show up with no value at all
 274             my $value = $2 || $3 || $4 || 1;
 275
 276             $atts_hash->{lc $name} = $value;
 277         }
 278
 279         return token($token_type, $groups->{tagname}, $atts_hash);
 280     } elsif ($groups->{entity}) {
 281         return token('Entity', $groups->{entityname});
 282     }
 283
 284     # shouldn't be able to get here.
 285     die "tokenize_tag_or_entity didn't get anything it expected.";
 286 }
 287
 288 # Given something known to be character-based text with no
 289 # (X)HTML in it, tokenize it.
 290 sub tokenize_text {
 291     my $s = shift;
 292     my @tokens;
 293
 294     my @texthunks = split(/(\n+)/, $s);
 295     my $i = 0;
 296
 297     for my $fragment (@texthunks) {
 298         if ($i %2) {
 299             my $newlines = length($fragment); # all newlines
 300             if ($newlines == 1) {
 301                 push @tokens, token('Newline');
 302             } else {
 303                 push @tokens, token('DoubleNewline');
 304             }
 305         } else {
 306             if ($fragment) { # skip empty matches
 307                 push @tokens, token('Text', $fragment);
 308             }
 309         }
 310
 311         $i++;
 312     }
 313
 314     return \@tokens;
 315 }
 316
 317 # Given a tag and our validation criteria, verify the tag is
 318 # allowed. If it is not allowed, this returns 0. It may also
 319 # destructively manipulate the tag's attributes to make them valid.
 320 sub validate_tag {
 321     my $tag = shift;
 322     my $allowed_elements = shift;
 323
 324     # It's all allowed.
 325     if (!defined($allowed_elements)) {
 326         return $tag;
 327     }
 328
 329     my $legal_attributes = $allowed_elements->{$tag->{name}};
 330
 331     # Is this allowed at all?
 332     if (!$legal_attributes) {
 333         return 0;
 334     }
 335
 336     # If this is a close tag, pass it through.
 337     if ($tag->{type} == $TAGCLOSE) {
 338         return $tag;
 339     }
 340
 341     # Now, actually validate the attributes;
 342     while (my ($name, $value) = each %{$tag->{atts}}) {
 343         my $attribute_validation = $legal_attributes->{$name};
 344
 345         # Is the attribute even allowed?
 346         if (!$attribute_validation) {
 347             # Nope, toast it.
 348             delete($tag->{atts}->{$name});
 349             next;
 350         }
 351
 352         if ($attribute_validation eq '1') { # note string compare
 353             # everything is permitted here, pass through
 354             next;
 355         }
 356
 357         my $new_value;
 358         if (ref($attribute_validation) eq 'CODE') {
 359             $new_value = $attribute_validation->($value);
 360             if (!defined($new_value)) {
 361                 # This attribute is so evil that the entire tag
 362                 # has to go away!
 363                 return 0;
 364             }
 365
 366             if (!$new_value) { # either false or empty string
 367                 delete($tag->{atts}->{$name});
 368                 next;
 369             }
 370         } elsif (ref($attribute_validation) eq 'Regexp') {
 371             my $match = $value =~ /$attribute_validation/;
 372             if (!$match) {
 373                 return 0;
 374             }
 375             # Otherwise pass through; for better control, use a sub
 376             $new_value = $value;
 377         } elsif ($attribute_validation) { # any true value
 378             $new_value = $value;
 379         } else {
 380             # any false value; redundent since unspecified = false
 381             delete($tag->{atts}->{$name});
 382             next;
 383         }
 384
 385         $tag->{atts}->{$name} = $new_value;
 386     }
 387
 388     return $tag;
 389 }
 390
 391 # The money function at last.
 392 sub normalize {
 393     my $html = shift;
 394     my $allowed_elements = shift; # default - allow everything
 395     my $allow_cdata = shift; # default - no
 396     my $allow_taglike = shift;
 397     my $tag_translation = shift || {};
 398
 399     my $text_only = shift; # don't automatically slap in <p>s
 400
 401     # Whitespace trimmer
 402     {
 403         # Apparently, applying this to something that consists
 404         # entirely of whitespace gets a warning about using
 405         # an uninitialized value. Probably just the implementation
 406         # poking out and doesn't seem to be worth worrying about.
 407         no warnings 'uninitialized';
 408         $html =~ s/(^\s+)|(\s+$ )//gx;
 409     }
 410
 411     # FIXME: Verify in utf-8 text encoding
 412
 413     # If we elimated the HTML in that step, return nothing
 414     if (!$html) {
 415         return '';
 416     }
 417
 418     my @tokens;
 419
 420     if ($allow_cdata) {
 421         my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
 422                                     $html);
 423         my $i = 0;
 424
 425         for my $fragment (@cdata_fragments) {
 426             if ($i % 2) {
 427                 push @tokens, token('CData', $fragment);
 428             } else {
 429                 tokenize_fragment(\@tokens, $fragment,
 430                                   $allowed_elements, $allow_taglike,
 431                                   $tag_translation);
 432             }
 433             $i++;
 434         }
 435     } else {
 436         tokenize_fragment(\@tokens, $html, $allowed_elements,
 437                           $allow_taglike, $tag_translation);
 438     }
 439
 440     # Push a p on front if the first token is not a block
 441     # tag. Note that <html> is a block tag, so complete
 442     # documents don't get this done, only fragments. This work
 443     # because the TagStack will normalize the p correctly.
 444     if (!$text_only &&
 445         @tokens && ($tokens[0]->{type} != $TAG ||
 446                     ($TAG_TYPES{$tokens[0]->{name}} || $BLOCK)
 447                         != $BLOCK)) {
 448         unshift @tokens, token('Tag', 'p');
 449     }
 450
 451     my @new_tokens;
 452     # we abusively share modification between this func
 453     # and the tag stack :(
 454     my $tag_stack = new Thrasher::HTMLNormalize::TagStack(\@new_tokens);
 455
 456     for (my $i = 0; $i < @tokens; $i++) {
 457         my $token = $tokens[$i];
 458
 459         my $type = $token->{type};
 460         if ($type == $TAG) {
 461             $tag_stack->push($token);
 462         }
 463         if ($type == $TAGCLOSE) {
 464             $tag_stack->pop($token);
 465         }
 466         if ($type == $TEXT || $type == $ENTITY ||
 467             $type == $SELFCLOSEDTAG || $type == $CDATA) {
 468             push @new_tokens, $token
 469         }
 470
 471         # If a newline is already preceded by a self-closed
 472         # br, pass it through, otherwise, prepend a self-closed br
 473         if ($type == $NEWLINE) {
 474             # Well, actually, if we're in a "pre", skip all fancy
 475             # processing
 476             if ($tag_stack->in_tag('pre')) {
 477                 push @new_tokens, token('Text', "\n");
 478                 next;
 479             }
 480
 481             if (!@new_tokens) {
 482                 next;
 483             }
 484
 485             my $previous_token = $tokens[$i - 1];
 486             if ($previous_token->{type} == $SELFCLOSEDTAG &&
 487                 $previous_token->{name} eq 'br') {
 488                 push @new_tokens, $token;
 489             } else {
 490                 # add a br somehowe
 491
 492                 # we're not at the end of the tokens.
 493                 if ($i + 1 < @tokens) {
 494                     my $next_token = $tokens[$i + 1];
 495                     if ($next_token->{type} == $TAG) {
 496                         my $tag_type = $TAG_TYPES{$next_token->{name}}
 497                                        || $BLOCK;
 498                         if ($tag_type != $BLOCK &&
 499                             $tag_type != $BLOCK_SELF_CLOSE) {
 500                             # this automatically gets eaten if there
 501                             # was no previous <p>
 502                             if (!$text_only) {
 503                                 push @new_tokens,
 504                                      token('SelfClosedTag', 'br');
 505                             }
 506                             push @new_tokens, $token;
 507                         } else {
 508                             push @new_tokens, $token;
 509                         }
 510                     } else {
 511                         if (!$text_only) {
 512                             push @new_tokens, token('SelfClosedTag', 'br');
 513                         }
 514                         push @new_tokens, $token;
 515                     }
 516                 }
 517             }
 518         }
 519
 520         if ($type == $DOUBLENEWLINE) {
 521             if ($tag_stack->in_tag('pre')) {
 522                 push @new_tokens, token('Text', "\n\n");
 523                 next;
 524             }
 525
 526             # Always assume we're closing a <p>. It gets eaten later
 527             # by the TagStack if this turns out to be false.
 528             if (!$text_only) {
 529                 $tag_stack->pop(token('TagClose', 'p'));
 530             }
 531
 532             # If this is followed by a non-block tag, add a <p>.
 533             if ($i < @tokens - 1) {
 534                 my $next_token = $tokens[$i + 1];
 535                 if ($next_token->{type} == $TAG) {
 536                     my $tag_type = $TAG_TYPES{$next_token->{name}} || $BLOCK;
 537                     if ($tag_type != $BLOCK &&
 538                         $tag_type != $BLOCK_SELF_CLOSE) {
 539                         push @new_tokens, $token;
 540                         if (!$text_only) {
 541                             $tag_stack->push(token('Tag', 'p'));
 542                         }
 543                     } else {
 544                         push @new_tokens, $token;
 545                     }
 546                 } else {
 547                     push @new_tokens, $token;
 548                     if (!$text_only) {
 549                         $tag_stack->push(token('Tag', 'p'));
 550                     }
 551                 }
 552             }
 553         }
 554     }
 555
 556     $tag_stack->terminate();
 557
 558     my $new_html = join '', map { $_->render } @new_tokens;
 559
 560     if ($VALIDATE_AS_XML) {
 561         # Do a simple validation pass.
 562         die "XML validation not implemented yet.";
 563     }
 564
 565     return $new_html;
 566 }
 567
 568 # Performs low-level tokenization on a bit of text known not to have
 569 # (valid) CDATA in it.
 570 sub tokenize_fragment {
 571     my $tokens = shift;
 572     my $html = shift;
 573     my $allowed_elements = shift;
 574     my $allow_taglike = shift;
 575     my $tag_translation = shift;
 576
 577     # Same as the validation re in tokenize_tag_or_entity, but
 578     # with all but the outer parens turned into non-capturing.
 579     my @fragments = split (/((?:<(?:\/)?(?:\w+)(?:[^>]*)>)|(?:&(?:\#?\w+);))/,
 580                            $html);
 581
 582     for (my $i = 0; $i < @fragments; $i++) {
 583         my $text = $fragments[$i];
 584
 585         if ($i % 2) {
 586             my $token = tokenize_tag_or_entity($text, $allow_taglike,
 587                                                $tag_translation);
 588             if ($token->{type} == $TAG) {
 589                 $token = validate_tag($token, $allowed_elements);
 590             }
 591             if ($token) {
 592                 push @$tokens, $token;
 593             }
 594         } else {
 595             push @$tokens, @{tokenize_text($text)};
 596         }
 597     }
 598 }
 599
 600 ###
 601 ### TOKEN CLASSES
 602 ###
 603
 604 package Thrasher::HTMLNormalize::Token;
 605 # A base class for tokens, used for 'isa' checks.
 606
 607 # This is only used by children; do not call directly.
 608 sub _new {
 609     my $class = shift;
 610     my %values = @_;
 611     my $self = \%values;
 612     bless $self, $class;
 613     return $self;
 614 }
 615
 616
 617 package Thrasher::HTMLNormalize::Token::Text;
 618 use base 'Thrasher::HTMLNormalize::Token';
 619
 620 sub new { shift()->SUPER::_new(text => shift(), type => $TEXT) }
 621
 622 sub render {
 623     return Thrasher::HTMLNormalize::escape($_[0]->{text});
 624 }
 625
 626 package Thrasher::HTMLNormalize::TagBase;
 627 use base 'Thrasher::HTMLNormalize::Token';
 628 # Contains the similarities between Tags and SelfClosedTags.
 629
 630 # process an attribute list into proper HTML; this assumes that
 631 # the attributes has been checked for valid attributes already,
 632 # but this also checks for permitted entities. Returns an arrayref
 633 # containing strings ready-to-be-output.
 634 sub att_list {
 635     my $self = shift;
 636     my @atts;
 637     while (my ($name, $value) = each %{$self->{atts}}) {
 638         my @fragments = split (/&(\w+);/, $value);
 639         my $i = 0;
 640         my @new_value;
 641         for my $text (@fragments) {
 642             if ($i % 2) {
 643                 # This is an entity body.
 644                 push @new_value, Thrasher::HTMLNormalize::make_entity($text);
 645             } else {
 646                 # normal text
 647                 push(@new_value,
 648                      Thrasher::HTMLNormalize::escape_quote($text));
 649             }
 650             $i ++;
 651         }
 652         my $value = join '', @new_value;
 653         # note $name was checked for validity implicitly by the
 654         # regex that pulled it out in the first place.
 655         push @atts, "$name='$value'";
 656     }
 657     return \@atts;
 658 }
 659
 660 package Thrasher::HTMLNormalize::Token::Tag;
 661 use base 'Thrasher::HTMLNormalize::TagBase';
 662 # Represents a Tag, which may have attributes.
 663 # Takes: A name, and a hashref of atts.
 664 # Note that name is checked implicitly by the regex that pulls it out.
 665
 666 use Carp qw(confess);
 667
 668 sub new {
 669     my $self = shift()->SUPER::_new(name => shift(),
 670                                     atts => shift() || {},
 671                                     type => $TAG);
 672     confess "Hash" if ref($self->{name}) eq 'HASH';
 673     return $self;
 674 }
 675
 676 sub render {
 677     my $self = shift;
 678     my $atts = $self->att_list;
 679
 680     if (@$atts) {
 681         return "<$self->{name} " . join(' ', @$atts) . ">";
 682     } else {
 683         return "<$self->{name}>";
 684     }
 685 }
 686
 687 package Thrasher::HTMLNormalize::Token::SelfClosedTag;
 688 use base 'Thrasher::HTMLNormalize::TagBase';
 689 # Represents a self-closing tag, which may have attributes.
 690
 691 sub new { shift()->SUPER::_new(name => shift(),
 692                                atts => shift() || {},
 693                                type => $SELFCLOSEDTAG); }
 694
 695 sub render {
 696     my $self = shift;
 697     my $atts = $self->att_list;
 698
 699     if (@$atts) {
 700         return "<$self->{name} " . join (' ', @$atts) . " />";
 701     } else {
 702         return "<$self->{name} />";
 703     }
 704 }
 705
 706 package Thrasher::HTMLNormalize::Token::TagClose;
 707 use base 'Thrasher::HTMLNormalize::Token';
 708 # Represents a close tag. Has only a name.
 709
 710 sub new { shift()->SUPER::_new(name => shift(),
 711                                type => $TAGCLOSE); }
 712
 713 sub render {
 714     return "</$_[0]->{name}>";
 715 }
 716
 717 package Thrasher::HTMLNormalize::Token::Entity;
 718 use base 'Thrasher::HTMLNormalize::Token';
 719 use Thrasher::HTMLNormalize::Entities qw(html_entity);
 720 # Represents an entity. Has only a name. Performs numberization of
 721 # HTML entities
 722
 723 sub new {
 724     my $class = shift;
 725
 726     my $name = shift;
 727     if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity($name))) {
 728         $name = '#' . $number;
 729     }
 730     # make_entity takes care of nulling out illegal entities
 731     # that pass this test
 732
 733     return $class->SUPER::_new(name => $name, type => $ENTITY);
 734 }
 735
 736 sub render {
 737     return Thrasher::HTMLNormalize::make_entity($_[0]->{name});
 738 }
 739
 740 package Thrasher::HTMLNormalize::Token::Newline;
 741 use base 'Thrasher::HTMLNormalize::Token';
 742 # Represents a single newline, suitable for <br />
 743
 744 sub new { shift()->SUPER::_new(type => $NEWLINE); }
 745
 746 sub render { "\n"; }
 747
 748 package Thrasher::HTMLNormalize::Token::DoubleNewline;
 749 use base 'Thrasher::HTMLNormalize::Token';
 750 # Double new line, suitable for <p>-ification
 751
 752 sub new { shift()->SUPER::_new(type => $DOUBLENEWLINE); }
 753
 754 sub render { "\n\n"; }
 755
 756 package Thrasher::HTMLNormalize::Token::CData;
 757 use base 'Thrasher::HTMLNormalize::Token';
 758
 759 sub new { shift()->SUPER::_new(rawtext => shift(), type => $CDATA); }
 760
 761 sub render { "<![CDATA[" . $_[0]->{rawtext} . "]]>"; }
 762
 763 ###
 764 ### TAG STACK
 765 ###
 766
 767 package Thrasher::HTMLNormalize::TagStack;
 768 use strict;
 769 use warnings;
 770
 771 use Data::Dumper;
 772
 773 *{token} = *Thrasher::HTMLNormalize::token;
 774
 775 # This is what it says it is; the "stack" of tags that has led
 776 # us to this point in the HTML. It is a "smart" stack; it can
 777 # silently refuse to accept tag tokens, resulting in the removal
 778 # of that tag from the resulting HTML. If you need to implement
 779 # the real nesting rules for (X)HTML tags, this is where you'd
 780 # put that logic.
 781
 782 sub new {
 783     my $class = shift;
 784     my $self = {stack => [],
 785                 # we abusively modify this in the normalize function :(
 786                 tokens => shift};
 787     bless $self, $class;
 788     return $self;
 789 }
 790
 791 # last element on the stack
 792 sub laststack { $_[0]->{stack}->[-1]; }
 793 sub lasttoken { $_[0]->{tokens}->[-1]; }
 794
 795 # Push a new tag token onto the stack.
 796 sub push {
 797     my $self = shift;
 798     my $tag = shift;
 799
 800     # Assume certain tags close themselves if they appear immediately
 801     # after themselves, i.e., <li>stuff<li>morestuff.
 802     # In particular, this if statement is asking "Is the previous
 803     # element the same as this element and a SELF_CLOSER like p?"
 804     if (@{$self->{stack}} && $self->laststack->{name} eq $tag->{name}
 805         && $SELF_CLOSERS{$tag->{name}}) {
 806         # If so, 'pop' the tag, which causes the emission of the
 807         # appropriate end tag (see pop)
 808         $self->pop(token('TagClose', $tag->{name}));
 809     }
 810
 811     # If this tag should always self close, convert it to such
 812     # unconditionally
 813     if ($TAG_TYPES{$tag->{name}} == $BLOCK_SELF_CLOSE) {
 814         push @{$self->{tokens}}, token('SelfClosedTag',
 815                                        $tag->{name},
 816                                        $tag->{atts});
 817         return;
 818     }
 819
 820     # otherwise, a normal push
 821     push @{$self->{stack}}, $tag;
 822     push @{$self->{tokens}}, $tag;
 823 }
 824
 825 # A lot of the normalization magic takes place here.
 826 sub pop {
 827     my $self = shift;
 828     my $tag = shift;
 829
 830     # We want a chance to manipulate the tokens we push back on
 831     # a bit, before losing track of the tokens this func
 832     # generated by tossing them onto the pile
 833     my @tokens;
 834
 835     # If a newline or doublenewline is currently at the end of the
 836     # stack, pop it off for a moment and put it back later. This has
 837     # the effect of binding close paragraph tags to the paragraph they
 838     # were used in. Without this,
 839     #
 840     #  <p>Hello.
 841     #
 842     #  <p>There.
 843     #
 844     # (complete with those newlines) gets converted to
 845     #
 846     #  <p>Hello.
 847     #
 848     #  </p><p>There.</p>
 849     #
 850     # which really isn't the intent, even if it is valid HTML.
 851     my $popped_newline;
 852     if (@{$self->{tokens}} &&
 853           ($self->lasttoken->{type} == $NEWLINE ||
 854            $self->lasttoken->{type} == $DOUBLENEWLINE)) {
 855         $popped_newline = pop @{$self->{tokens}};
 856     }
 857
 858     # Scan backwards for the matching start tag, emit
 859     # any missing close tags.
 860     my $i = scalar(@{$self->{stack}}) - 1;
 861     while ($i >= 0) {
 862         my $open_tag = $self->{stack}->[$i];
 863         if ($open_tag->{name} eq $tag->{name}) {
 864             while ($i + 1 < @{$self->{stack}}) {
 865                 my $tag_to_close = pop @{$self->{stack}};
 866                 CORE::push(@tokens,
 867                            token('TagClose', $tag_to_close->{name}));
 868             }
 869             CORE::pop @{$self->{stack}};
 870             CORE::push @tokens, token('TagClose', $tag->{name});
 871             last;
 872         }
 873         $i--;
 874     }
 875
 876     # If we're closing off the previous *token* (not stack
 877     # element), and there is therefore no content for that
 878     # token, either elide it if it is an empty block token,
 879     # or convert it to a self-closed tag
 880     if (@tokens && $self->lasttoken->{type} == $TAG &&
 881         $self->lasttoken->{name} eq $tokens[0]->{name}) {
 882         my $name = $tokens[0]->{name};
 883         my $tag_type = $TAG_TYPES{$name};
 884         if ($tag_type == $BLOCK) {
 885             # elide the token
 886             pop @tokens;
 887             pop @{$self->{tokens}};
 888         }
 889         if ($tag_type == $BLOCK_SELF_CLOSE) {
 890             pop @tokens;
 891             my $old_tag = pop @{$self->tokens};
 892             unshift @tokens, token('SelfClosedTag',
 893                                    $old_tag->{name},
 894                                    $old_tag->{atts});
 895         }
 896     }
 897
 898     # If there was no corresponding open tag, just discard it by
 899     # doing nothing on purpose.
 900
 901     # do_nothing();
 902
 903     # If we popped off a newline, now is when it goes back on
 904     CORE::push @tokens, $popped_newline if $popped_newline;
 905
 906     CORE::push @{$self->{tokens}}, @tokens;
 907 }
 908
 909 # Returns true if we are "in" the given tag, false otherwise. This
 910 # is used to implement special treatment for <pre>, for instance
 911 sub in_tag {
 912     my $self = shift;
 913     my $tag_name = shift;
 914
 915     for my $tag (@{$self->{stack}}) {
 916         return 1 if $tag->{name} eq $tag_name;
 917     }
 918     return 0;
 919 }
 920
 921 # Terminates the TagStack by popping all current elements. Used
 922 # at the end of the input string. Despite the name, not the same
 923 # as a DESTROY function.
 924 sub terminate {
 925     my $self = shift;
 926
 927     while (@{$self->{stack}}) {
 928         my $tag_still_open = CORE::pop @{$self->{stack}};
 929         CORE::push @{$self->{tokens}}, token('TagClose',
 930                                              $tag_still_open->{name});
 931     }
 932 }
 933
 934 1;
 935 __END__
 936
 937 =head1 NAME
 938
 939 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
 940 display
 941
 942 =head1 SYNOPSIS
 943
 944  use Thrasher::HTMLNormalize qw(:all);
 945
 946  # Take arbitrary HTML and make it syntactically correct
 947  my $normalizedHTML = normalize($html);
 948
 949  # Take arbitrary maybe-HTML and make it correct, and
 950  # use a chosen sub-set of HTML suitable for simple rich-text-like
 951  # HTML.
 952  my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
 953
 954  # Take arbitrary maybe-HTML and make it correct, use a
 955  # a chosen sub-set of HTML suitable for simple rich-text-like
 956  # HTML, and if you see a "tag" that isn't actually HTML,
 957  # escape it as if it was text.
 958  my $message_log_like_html = normalize($html,
 959                                        $CONSERVATIVE_PERMISSIONS,
 960                                        undef, 1);
 961
 962
 963 =head1 MOTIVATION
 964
 965 This predated HTML::Normalize on CPAN. I just had it lying around, and
 966 I understand it, so I used it. I recommend checking out
 967 HTML::Normalize if you think you might want to use this. (On the
 968 other hand, I've had to implement features here that it doesn't have.)
 969
 970 This is a utility module; Thrasher::XHTMLNormalize specifically
 971 defines the normalization for XHTML-IM.
 972
 973 This is a utility module; Thrasher::XHTMLNormalize specifically
 974 defines the normalization for XHTML-IM.
 975
 976 =cut