perl/lib/Thrasher/HTMLNormalize.pm

   1 package Thrasher::HTMLNormalize;
   2 use strict;
   3 use warnings;
   4
   5 use Thrasher::HTMLNormalize::Entities qw(html_entity);
   6
   7 use base 'Exporter';
   8
   9 our @EXPORT_OK = qw(&escape $CONSERVATIVE_PERMISSIONS &normalize
  10                     &escape_quote $CONSERVATIVE_PERMISSIONS_NO_LINKS);
  11
  12 our %EXPORT_TAGS = (all => \@EXPORT_OK);
  13
  14 my $VALIDATE_AS_XML = 0;
  15
  16 ###
  17 ### PLEASE NOTE: This module has unit tests. Changing this module
  18 ###              without verifying them is a bad idea.
  19 ###
  20
  21 # Also note that this file isn't as dirty as it looks; that's
  22 # just the natural result of sticking an entire compiler in one
  23 # file.
  24
  25 ###
  26 ### CONSTANT AND DATA DECLARATIONS
  27 ###
  28 ### Recall "my" is actually file scoped, not package scoped.
  29 ### use constant, however, *is* package scoped, so it's darned inconvenient.
  30
  31 # A set-like hash indicating which entities are allowed
  32 # to exist as text-named entities, and not converted into numeric
  33 # entities.
  34 my %LEGAL_ENTITIES = map { $_ => 1 } qw(amp lt gt quot apos);
  35
  36 my %ESCAPE_ENTITIES = ('&' => '&amp;',
  37                        '>' => '&gt;',
  38                        '<' => '&lt;' );
  39 my %QUOTE_ENTITIES = (%ESCAPE_ENTITIES,
  40                       "'" => '&apos;',
  41                       '"' => '&quot;');
  42
  43 # Tags that sort of close themselves
  44 my %SELF_CLOSERS = map { $_ => 1 } qw(li p);
  45
  46 # Token types
  47 my $TEXT          = 1;
  48 my $TAG           = 2;
  49 my $TAGCLOSE      = 3;
  50 my $SELFCLOSEDTAG = 4;
  51 my $ENTITY        = 5;
  52 my $NEWLINE       = 6;
  53 my $DOUBLENEWLINE = 7;
  54 my $CDATA         = 8;
  55
  56 # Tag types. These are used for formatting the resulting text.
  57 # A block tag, as in CSS
  58 my $BLOCK         = 10;
  59 # An inline tag, as in CSS
  60 my $INLINE        = 11;
  61 # A 'block' tag that should be converted to a self-closing
  62 # tag if an empty instance is found (i.e., <br></br> => <br />)
  63 my $BLOCK_SELF_CLOSE = 12;
  64
  65 my %TAG_TYPES =
  66     (
  67      p => $BLOCK,
  68      div => $BLOCK,
  69      span => $INLINE,
  70      br => $BLOCK_SELF_CLOSE,
  71      a => $INLINE,
  72      li => $BLOCK,
  73      dl => $BLOCK,
  74      dt => $BLOCK,
  75      dd => $BLOCK,
  76      ul => $BLOCK,
  77      font => $INLINE,
  78      abbr => $INLINE,
  79      acronym => $INLINE,
  80      b => $INLINE,
  81      bgsound => $INLINE,
  82      big => $INLINE,
  83      caption => $INLINE,
  84      img => $INLINE,
  85      blockquote => $BLOCK,
  86      h1 => $BLOCK,
  87      h2 => $BLOCK,
  88      h3 => $BLOCK,
  89      h4 => $BLOCK,
  90      h5 => $BLOCK,
  91      h6 => $BLOCK,
  92      em => $INLINE,
  93      strong => $INLINE,
  94      dfn => $INLINE,
  95      code => $INLINE,
  96      samp => $INLINE,
  97      kbd => $INLINE,
  98      var => $INLINE,
  99      cite => $INLINE,
 100      hr => $BLOCK_SELF_CLOSE,
 101      i => $INLINE,
 102      ol => $BLOCK,
 103      pre => $BLOCK,
 104      small => $INLINE,
 105      s => $INLINE,
 106      strike => $INLINE,
 107      'sub' => $INLINE,
 108      sup => $INLINE,
 109      tt => $INLINE,
 110      table => $BLOCK,
 111      tr => $BLOCK,
 112      td => $BLOCK,
 113      u => $INLINE,
 114      html => $BLOCK,
 115      body => $BLOCK
 116      );
 117
 118 # A reasonable pre-defined behavior for URLs. If it has a protocol
 119 # in the front, it *must* be http or https. Anything else gets
 120 # destroyed. If it doesn't have a protocol in the front, we'll
 121 # at least take a shot and prepend http://.
 122 sub fixURL {
 123     my $url = shift;
 124     if ($url !~ /^https?:\/\//) {
 125         # has non-http protocol. Evil. Die now.
 126         # or maybe an incorrectly-specified port. Die now anyhow.
 127         if ($url =~ /^[^:]+:/) {
 128             return undef;
 129         }
 130
 131         return 'http://' . $url;
 132     }
 133     return $url;
 134 }
 135
 136 # A reasonable pre-defined conservative set. Allows a reasonable
 137 # range of simple formatting tags, but the only attributes allowed
 138 # are URLS (and those only with http or https protocols), and
 139 # titles for abbr and acronyms. Certainly no code execution allowed.
 140 our $CONSERVATIVE_PERMISSIONS = {
 141     p => {},
 142     br => {},
 143     a => {href => \&fixURL},
 144     ul => {},
 145     li => {},
 146     abbr => {title => 1},
 147     acronym => {title => 1},
 148     b => {},
 149     big => {},
 150     blockquote => {},
 151     em => {},
 152     strong => {},
 153     dfn => {},
 154     code => {},
 155     samp => {},
 156     kbd => {},
 157     var => {},
 158     cite => {},
 159     hr => {},
 160     i => {},
 161     ol => {},
 162     pre => {},
 163     small => {},
 164     strike => {},
 165     sub => {},
 166     sup => {},
 167     tt => {},
 168     u => {}
 169 };
 170
 171 my %CONSERVATIVE_PERMISSIONS_NO_LINKS = %$CONSERVATIVE_PERMISSIONS;
 172 our $CONSERVATIVE_PERMISSIONS_NO_LINKS = \%CONSERVATIVE_PERMISSIONS_NO_LINKS;
 173 delete $CONSERVATIVE_PERMISSIONS_NO_LINKS->{a};
 174
 175 ###
 176 ### UTILITY FUNCTIONS
 177 ###
 178
 179 # Return whether $s is the name of a valid entity
 180 sub is_valid_entity {
 181     my $s = shift;
 182     return !!($s =~ /\#\d{1,5}/) || $LEGAL_ENTITIES{$s};
 183 }
 184
 185 # Given an entity name, either return that entity or an empty
 186 # string to elide it.
 187 sub make_entity {
 188     my $s = shift;
 189     if (is_valid_entity($s)) {
 190         return "&$s;";
 191     } else {
 192         return '';
 193     }
 194 }
 195
 196 sub escape {
 197     my $s = shift;
 198     $s =~ s/([&<>])/$ESCAPE_ENTITIES{$1}/ge;
 199     return $s;
 200 }
 201 sub escape_quote {
 202     my $s = shift;
 203     $s =~ s/([&<>'"])/$QUOTE_ENTITIES{$1}/ge; #'])/;
 204     return $s;
 205 }
 206
 207 # Nicer interface to create tokens; just specify the last bit.
 208 sub token {
 209     my $token_type = shift;
 210     $token_type = 'Thrasher::HTMLNormalize::Token::' . $token_type;
 211     return $token_type->new(@_);
 212 }
 213
 214 # Tokenize a tag or entity. We know it's one of them because of
 215 # how it was parsed. (Pulled out for simple testing.)
 216 sub tokenize_tag_or_entity {
 217     my $tag_or_entity = shift;
 218     my $allow_taglike = shift;
 219     my $tag_translation = shift;
 220
 221     # \# is to convince emacs syntax highlighting that # is not a comment
 222     if ($tag_or_entity !~ /((<(\/)?(\w+)([^>]*)>)|(&(\#?\w+);))/) {
 223         die "tokenize_tag_or_entity must be passed a tag or entity: "
 224             .$tag_or_entity;
 225     }
 226
 227     # Convert that mass of matches into something comprehensible
 228     my $groups = {tag => $2,    # the whole of the tag
 229                   slash => $3,  # if it has a slash to close the tag
 230                   tagname => $4,# the tag name
 231                   tagremains => $5, # the rest of the tag (atts)
 232                   entity => $6, # the whole of the entity
 233                   entityname => $7 };
 234
 235     $groups->{tagname} = lc $groups->{tagname};
 236
 237     if (defined(my $tag_translator =
 238                     $tag_translation->{lc $groups->{tagname}})) {
 239         # use Data::Dumper;
 240         #print "Initial: " . Dumper($groups);
 241         if (ref($tag_translator) eq '') {
 242             $groups->{tagname} = $tag_translator;
 243         } elsif (ref($tag_translator) eq 'CODE') {
 244             $tag_translator->($groups);
 245         }
 246         #print "Post-translation: " . Dumper($groups);
 247     }
 248
 249     if ($groups->{tag} && $allow_taglike &&
 250         !$TAG_TYPES{lc $groups->{tagname}}) {
 251         return token('Text', $tag_or_entity);
 252     } elsif ($groups->{tag}) {
 253         # handle tokenizing a tag
 254         if ($groups->{slash}) {
 255             return token('TagClose', $groups->{tagname});
 256         }
 257
 258         my $token_type = 'Tag';
 259
 260         my $atts = $groups->{tagremains};
 261         if ($atts && substr($atts, -1) eq '/') {
 262             $token_type = 'SelfClosedTag';
 263             $atts = substr($atts, 0, length($atts) - 1); # pop off /
 264         }
 265
 266         my $atts_hash = {};
 267
 268         # Pull out the attributes
 269         while ($atts =~ /([a-zA-Z]\w+)(?:=(?:(?:\"([^\">]*)\")|(?:'([^'>]*)\')|(?:([^ \n\t>]*))))?/g) {
 270             my $name = $1;
 271                 # one value delimited by ", the other by ',
 272                 # the final by whitespace, the final case
 273                 # covers those few atts in HTML that can
 274                 # show up with no value at all
 275             my $value = $2 || $3 || $4 || 1;
 276
 277             $atts_hash->{lc $name} = $value;
 278         }
 279
 280         return token($token_type, $groups->{tagname}, $atts_hash);
 281     } elsif ($groups->{entity}) {
 282         return token('Entity', $groups->{entityname});
 283     }
 284
 285     # shouldn't be able to get here.
 286     die "tokenize_tag_or_entity didn't get anything it expected.";
 287 }
 288
 289 # Given something known to be character-based text with no
 290 # (X)HTML in it, tokenize it.
 291 sub tokenize_text {
 292     my $s = shift;
 293     my @tokens;
 294
 295     my @texthunks = split(/(\n+)/, $s);
 296     my $i = 0;
 297
 298     for my $fragment (@texthunks) {
 299         if ($i %2) {
 300             my $newlines = length($fragment); # all newlines
 301             if ($newlines == 1) {
 302                 push @tokens, token('Newline');
 303             } else {
 304                 push @tokens, token('DoubleNewline');
 305             }
 306         } else {
 307             if ($fragment) { # skip empty matches
 308                 push @tokens, token('Text', $fragment);
 309             }
 310         }
 311
 312         $i++;
 313     }
 314
 315     return \@tokens;
 316 }
 317
 318 # Given a tag and our validation criteria, verify the tag is
 319 # allowed. If it is not allowed, this returns 0. It may also
 320 # destructively manipulate the tag's attributes to make them valid.
 321 sub validate_tag {
 322     my $tag = shift;
 323     my $allowed_elements = shift;
 324
 325     # It's all allowed.
 326     if (!defined($allowed_elements)) {
 327         return $tag;
 328     }
 329
 330     my $legal_attributes = $allowed_elements->{$tag->{name}};
 331
 332     # Is this allowed at all?
 333     if (!$legal_attributes) {
 334         return 0;
 335     }
 336
 337     # If this is a close tag, pass it through.
 338     if ($tag->{type} == $TAGCLOSE) {
 339         return $tag;
 340     }
 341
 342     # Now, actually validate the attributes;
 343     while (my ($name, $value) = each %{$tag->{atts}}) {
 344         my $attribute_validation = $legal_attributes->{$name};
 345
 346         # Is the attribute even allowed?
 347         if (!$attribute_validation) {
 348             # Nope, toast it.
 349             delete($tag->{atts}->{$name});
 350             next;
 351         }
 352
 353         if ($attribute_validation eq '1') { # note string compare
 354             # everything is permitted here, pass through
 355             next;
 356         }
 357
 358         my $new_value;
 359         if (ref($attribute_validation) eq 'CODE') {
 360             $new_value = $attribute_validation->($value);
 361             if (!defined($new_value)) {
 362                 # This attribute is so evil that the entire tag
 363                 # has to go away!
 364                 return 0;
 365             }
 366
 367             if (!$new_value) { # either false or empty string
 368                 delete($tag->{atts}->{$name});
 369                 next;
 370             }
 371         } elsif (ref($attribute_validation) eq 'Regexp') {
 372             my $match = $value =~ /$attribute_validation/;
 373             if (!$match) {
 374                 return 0;
 375             }
 376             # Otherwise pass through; for better control, use a sub
 377             $new_value = $value;
 378         } elsif ($attribute_validation) { # any true value
 379             $new_value = $value;
 380         } else {
 381             # any false value; redundent since unspecified = false
 382             delete($tag->{atts}->{$name});
 383             next;
 384         }
 385
 386         $tag->{atts}->{$name} = $new_value;
 387     }
 388
 389     return $tag;
 390 }
 391
 392 # The money function at last.
 393 sub normalize {
 394     my $html = shift;
 395     my $allowed_elements = shift; # default - allow everything
 396     my $allow_cdata = shift; # default - no
 397     my $allow_taglike = shift;
 398     my $tag_translation = shift || {};
 399
 400     my $text_only = shift; # don't automatically slap in <p>s
 401
 402     # Whitespace trimmer
 403     {
 404         # Apparently, applying this to something that consists
 405         # entirely of whitespace gets a warning about using
 406         # an uninitialized value. Probably just the implementation
 407         # poking out and doesn't seem to be worth worrying about.
 408         no warnings 'uninitialized';
 409         $html =~ s/(^\s+)|(\s+$ )//gx;
 410     }
 411
 412     # FIXME: Verify in utf-8 text encoding
 413
 414     # If we elimated the HTML in that step, return nothing
 415     if (!$html) {
 416         return '';
 417     }
 418
 419     my @tokens;
 420
 421     if ($allow_cdata) {
 422         my @cdata_fragments = split(/<!\[CDATA\[(.*?)\]\]>/,
 423                                     $html);
 424         my $i = 0;
 425
 426         for my $fragment (@cdata_fragments) {
 427             if ($i % 2) {
 428                 push @tokens, token('CData', $fragment);
 429             } else {
 430                 tokenize_fragment(\@tokens, $fragment,
 431                                   $allowed_elements, $allow_taglike,
 432                                   $tag_translation);
 433             }
 434             $i++;
 435         }
 436     } else {
 437         tokenize_fragment(\@tokens, $html, $allowed_elements,
 438                           $allow_taglike, $tag_translation);
 439     }
 440
 441     # Push a p on front if the first token is not a block
 442     # tag. Note that <html> is a block tag, so complete
 443     # documents don't get this done, only fragments. This work
 444     # because the TagStack will normalize the p correctly.
 445     if (!$text_only &&
 446         @tokens && ($tokens[0]->{type} != $TAG ||
 447                     ($TAG_TYPES{$tokens[0]->{name}} || $BLOCK)
 448                         != $BLOCK)) {
 449         unshift @tokens, token('Tag', 'p');
 450     }
 451
 452     my @new_tokens;
 453     # we abusively share modification between this func
 454     # and the tag stack :(
 455     my $tag_stack = new Thrasher::HTMLNormalize::TagStack(\@new_tokens);
 456
 457     for (my $i = 0; $i < @tokens; $i++) {
 458         my $token = $tokens[$i];
 459
 460         my $type = $token->{type};
 461         if ($type == $TAG) {
 462             $tag_stack->push($token);
 463         }
 464         if ($type == $TAGCLOSE) {
 465             $tag_stack->pop($token);
 466         }
 467         if ($type == $TEXT || $type == $ENTITY ||
 468             $type == $SELFCLOSEDTAG || $type == $CDATA) {
 469             push @new_tokens, $token
 470         }
 471
 472         # If a newline is already preceded by a self-closed
 473         # br, pass it through, otherwise, prepend a self-closed br
 474         if ($type == $NEWLINE) {
 475             # Well, actually, if we're in a "pre", skip all fancy
 476             # processing
 477             if ($tag_stack->in_tag('pre')) {
 478                 push @new_tokens, token('Text', "\n");
 479                 next;
 480             }
 481
 482             if (!@new_tokens) {
 483                 next;
 484             }
 485
 486             my $previous_token = $tokens[$i - 1];
 487             if ($previous_token->{type} == $SELFCLOSEDTAG &&
 488                 $previous_token->{name} eq 'br') {
 489                 push @new_tokens, $token;
 490             } else {
 491                 # add a br somehowe
 492
 493                 # we're not at the end of the tokens.
 494                 if ($i + 1 < @tokens) {
 495                     my $next_token = $tokens[$i + 1];
 496                     if ($next_token->{type} == $TAG) {
 497                         my $tag_type = $TAG_TYPES{$next_token->{name}}
 498                                        || $BLOCK;
 499                         if ($tag_type != $BLOCK &&
 500                             $tag_type != $BLOCK_SELF_CLOSE) {
 501                             # this automatically gets eaten if there
 502                             # was no previous <p>
 503                             if (!$text_only) {
 504                                 push @new_tokens,
 505                                      token('SelfClosedTag', 'br');
 506                             }
 507                             push @new_tokens, $token;
 508                         } else {
 509                             push @new_tokens, $token;
 510                         }
 511                     } else {
 512                         if (!$text_only) {
 513                             push @new_tokens, token('SelfClosedTag', 'br');
 514                         }
 515                         push @new_tokens, $token;
 516                     }
 517                 }
 518             }
 519         }
 520
 521         if ($type == $DOUBLENEWLINE) {
 522             if ($tag_stack->in_tag('pre')) {
 523                 push @new_tokens, token('Text', "\n\n");
 524                 next;
 525             }
 526
 527             # Always assume we're closing a <p>. It gets eaten later
 528             # by the TagStack if this turns out to be false.
 529             if (!$text_only) {
 530                 $tag_stack->pop(token('TagClose', 'p'));
 531             }
 532
 533             # If this is followed by a non-block tag, add a <p>.
 534             if ($i < @tokens - 1) {
 535                 my $next_token = $tokens[$i + 1];
 536                 if ($next_token->{type} == $TAG) {
 537                     my $tag_type = $TAG_TYPES{$next_token->{name}} || $BLOCK;
 538                     if ($tag_type != $BLOCK &&
 539                         $tag_type != $BLOCK_SELF_CLOSE) {
 540                         push @new_tokens, $token;
 541                         if (!$text_only) {
 542                             $tag_stack->push(token('Tag', 'p'));
 543                         }
 544                     } else {
 545                         push @new_tokens, $token;
 546                     }
 547                 } else {
 548                     push @new_tokens, $token;
 549                     if (!$text_only) {
 550                         $tag_stack->push(token('Tag', 'p'));
 551                     }
 552                 }
 553             }
 554         }
 555     }
 556
 557     $tag_stack->terminate();
 558
 559     my $new_html = join '', map { $_->render } @new_tokens;
 560
 561     if ($VALIDATE_AS_XML) {
 562         # Do a simple validation pass.
 563         die "XML validation not implemented yet.";
 564     }
 565
 566     return $new_html;
 567 }
 568
 569 # Performs low-level tokenization on a bit of text known not to have
 570 # (valid) CDATA in it.
 571 sub tokenize_fragment {
 572     my $tokens = shift;
 573     my $html = shift;
 574     my $allowed_elements = shift;
 575     my $allow_taglike = shift;
 576     my $tag_translation = shift;
 577
 578     # Same as the validation re in tokenize_tag_or_entity, but
 579     # with all but the outer parens turned into non-capturing.
 580     my @fragments = split (/((?:<(?:\/)?(?:\w+)(?:[^>]*)>)|(?:&(?:\#?\w+);))/,
 581                            $html);
 582
 583     for (my $i = 0; $i < @fragments; $i++) {
 584         my $text = $fragments[$i];
 585
 586         if ($i % 2) {
 587             my $token = tokenize_tag_or_entity($text, $allow_taglike,
 588                                                $tag_translation);
 589             if ($token->{type} == $TAG) {
 590                 $token = validate_tag($token, $allowed_elements);
 591             }
 592             if ($token) {
 593                 push @$tokens, $token;
 594             }
 595         } else {
 596             push @$tokens, @{tokenize_text($text)};
 597         }
 598     }
 599 }
 600
 601 ###
 602 ### TOKEN CLASSES
 603 ###
 604
 605 package Thrasher::HTMLNormalize::Token;
 606 # A base class for tokens, used for 'isa' checks.
 607
 608 # This is only used by children; do not call directly.
 609 sub _new {
 610     my $class = shift;
 611     my %values = @_;
 612     my $self = \%values;
 613     bless $self, $class;
 614     return $self;
 615 }
 616
 617
 618 package Thrasher::HTMLNormalize::Token::Text;
 619 use base 'Thrasher::HTMLNormalize::Token';
 620
 621 sub new { shift()->SUPER::_new(text => shift(), type => $TEXT) }
 622
 623 sub render {
 624     return Thrasher::HTMLNormalize::escape($_[0]->{text});
 625 }
 626
 627 package Thrasher::HTMLNormalize::TagBase;
 628 use base 'Thrasher::HTMLNormalize::Token';
 629 # Contains the similarities between Tags and SelfClosedTags.
 630
 631 # process an attribute list into proper HTML; this assumes that
 632 # the attributes has been checked for valid attributes already,
 633 # but this also checks for permitted entities. Returns an arrayref
 634 # containing strings ready-to-be-output.
 635 sub att_list {
 636     my $self = shift;
 637     my @atts;
 638     while (my ($name, $value) = each %{$self->{atts}}) {
 639         my @fragments = split (/&(\w+);/, $value);
 640         my $i = 0;
 641         my @new_value;
 642         for my $text (@fragments) {
 643             if ($i % 2) {
 644                 # This is an entity body.
 645                 push @new_value, Thrasher::HTMLNormalize::make_entity($text);
 646             } else {
 647                 # normal text
 648                 push(@new_value,
 649                      Thrasher::HTMLNormalize::escape_quote($text));
 650             }
 651             $i ++;
 652         }
 653         my $value = join '', @new_value;
 654         # note $name was checked for validity implicitly by the
 655         # regex that pulled it out in the first place.
 656         push @atts, "$name='$value'";
 657     }
 658     return \@atts;
 659 }
 660
 661 package Thrasher::HTMLNormalize::Token::Tag;
 662 use base 'Thrasher::HTMLNormalize::TagBase';
 663 # Represents a Tag, which may have attributes.
 664 # Takes: A name, and a hashref of atts.
 665 # Note that name is checked implicitly by the regex that pulls it out.
 666
 667 use Carp qw(confess);
 668
 669 sub new {
 670     my $self = shift()->SUPER::_new(name => shift(),
 671                                     atts => shift() || {},
 672                                     type => $TAG);
 673     confess "Hash" if ref($self->{name}) eq 'HASH';
 674     return $self;
 675 }
 676
 677 sub render {
 678     my $self = shift;
 679     my $atts = $self->att_list;
 680
 681     if (@$atts) {
 682         return "<$self->{name} " . join(' ', @$atts) . ">";
 683     } else {
 684         return "<$self->{name}>";
 685     }
 686 }
 687
 688 package Thrasher::HTMLNormalize::Token::SelfClosedTag;
 689 use base 'Thrasher::HTMLNormalize::TagBase';
 690 # Represents a self-closing tag, which may have attributes.
 691
 692 sub new { shift()->SUPER::_new(name => shift(),
 693                                atts => shift() || {},
 694                                type => $SELFCLOSEDTAG); }
 695
 696 sub render {
 697     my $self = shift;
 698     my $atts = $self->att_list;
 699
 700     if (@$atts) {
 701         return "<$self->{name} " . join (' ', @$atts) . " />";
 702     } else {
 703         return "<$self->{name} />";
 704     }
 705 }
 706
 707 package Thrasher::HTMLNormalize::Token::TagClose;
 708 use base 'Thrasher::HTMLNormalize::Token';
 709 # Represents a close tag. Has only a name.
 710
 711 sub new { shift()->SUPER::_new(name => shift(),
 712                                type => $TAGCLOSE); }
 713
 714 sub render {
 715     return "</$_[0]->{name}>";
 716 }
 717
 718 package Thrasher::HTMLNormalize::Token::Entity;
 719 use base 'Thrasher::HTMLNormalize::Token';
 720 use Thrasher::HTMLNormalize::Entities qw(html_entity);
 721 # Represents an entity. Has only a name. Performs numberization of
 722 # HTML entities
 723
 724 sub new {
 725     my $class = shift;
 726
 727     my $name = shift;
 728     if (!$LEGAL_ENTITIES{$name} && (my $number = html_entity($name))) {
 729         $name = '#' . $number;
 730     }
 731     # make_entity takes care of nulling out illegal entities
 732     # that pass this test
 733
 734     return $class->SUPER::_new(name => $name, type => $ENTITY);
 735 }
 736
 737 sub render {
 738     return Thrasher::HTMLNormalize::make_entity($_[0]->{name});
 739 }
 740
 741 package Thrasher::HTMLNormalize::Token::Newline;
 742 use base 'Thrasher::HTMLNormalize::Token';
 743 # Represents a single newline, suitable for <br />
 744
 745 sub new { shift()->SUPER::_new(type => $NEWLINE); }
 746
 747 sub render { "\n"; }
 748
 749 package Thrasher::HTMLNormalize::Token::DoubleNewline;
 750 use base 'Thrasher::HTMLNormalize::Token';
 751 # Double new line, suitable for <p>-ification
 752
 753 sub new { shift()->SUPER::_new(type => $DOUBLENEWLINE); }
 754
 755 sub render { "\n\n"; }
 756
 757 package Thrasher::HTMLNormalize::Token::CData;
 758 use base 'Thrasher::HTMLNormalize::Token';
 759
 760 sub new { shift()->SUPER::_new(rawtext => shift(), type => $CDATA); }
 761
 762 sub render { "<![CDATA[" . $_[0]->{rawtext} . "]]>"; }
 763
 764 ###
 765 ### TAG STACK
 766 ###
 767
 768 package Thrasher::HTMLNormalize::TagStack;
 769 use strict;
 770 use warnings;
 771
 772 use Data::Dumper;
 773
 774 *{token} = *Thrasher::HTMLNormalize::token;
 775
 776 # This is what it says it is; the "stack" of tags that has led
 777 # us to this point in the HTML. It is a "smart" stack; it can
 778 # silently refuse to accept tag tokens, resulting in the removal
 779 # of that tag from the resulting HTML. If you need to implement
 780 # the real nesting rules for (X)HTML tags, this is where you'd
 781 # put that logic.
 782
 783 sub new {
 784     my $class = shift;
 785     my $self = {stack => [],
 786                 # we abusively modify this in the normalize function :(
 787                 tokens => shift};
 788     bless $self, $class;
 789     return $self;
 790 }
 791
 792 # last element on the stack
 793 sub laststack { $_[0]->{stack}->[-1]; }
 794 sub lasttoken { $_[0]->{tokens}->[-1]; }
 795
 796 # Push a new tag token onto the stack.
 797 sub push {
 798     my $self = shift;
 799     my $tag = shift;
 800
 801     # Assume certain tags close themselves if they appear immediately
 802     # after themselves, i.e., <li>stuff<li>morestuff.
 803     # In particular, this if statement is asking "Is the previous
 804     # element the same as this element and a SELF_CLOSER like p?"
 805     if (@{$self->{stack}} && $self->laststack->{name} eq $tag->{name}
 806         && $SELF_CLOSERS{$tag->{name}}) {
 807         # If so, 'pop' the tag, which causes the emission of the
 808         # appropriate end tag (see pop)
 809         $self->pop(token('TagClose', $tag->{name}));
 810     }
 811
 812     # If this tag should always self close, convert it to such
 813     # unconditionally
 814     if ($TAG_TYPES{$tag->{name}} == $BLOCK_SELF_CLOSE) {
 815         push @{$self->{tokens}}, token('SelfClosedTag',
 816                                        $tag->{name},
 817                                        $tag->{atts});
 818         return;
 819     }
 820
 821     # otherwise, a normal push
 822     push @{$self->{stack}}, $tag;
 823     push @{$self->{tokens}}, $tag;
 824 }
 825
 826 # A lot of the normalization magic takes place here.
 827 sub pop {
 828     my $self = shift;
 829     my $tag = shift;
 830
 831     # We want a chance to manipulate the tokens we push back on
 832     # a bit, before losing track of the tokens this func
 833     # generated by tossing them onto the pile
 834     my @tokens;
 835
 836     # If a newline or doublenewline is currently at the end of the
 837     # stack, pop it off for a moment and put it back later. This has
 838     # the effect of binding close paragraph tags to the paragraph they
 839     # were used in. Without this,
 840     #
 841     #  <p>Hello.
 842     #
 843     #  <p>There.
 844     #
 845     # (complete with those newlines) gets converted to
 846     #
 847     #  <p>Hello.
 848     #
 849     #  </p><p>There.</p>
 850     #
 851     # which really isn't the intent, even if it is valid HTML.
 852     my $popped_newline;
 853     if (@{$self->{tokens}} &&
 854           ($self->lasttoken->{type} == $NEWLINE ||
 855            $self->lasttoken->{type} == $DOUBLENEWLINE)) {
 856         $popped_newline = pop @{$self->{tokens}};
 857     }
 858
 859     # Scan backwards for the matching start tag, emit
 860     # any missing close tags.
 861     my $i = scalar(@{$self->{stack}}) - 1;
 862     while ($i >= 0) {
 863         my $open_tag = $self->{stack}->[$i];
 864         if ($open_tag->{name} eq $tag->{name}) {
 865             while ($i + 1 < @{$self->{stack}}) {
 866                 my $tag_to_close = pop @{$self->{stack}};
 867                 CORE::push(@tokens,
 868                            token('TagClose', $tag_to_close->{name}));
 869             }
 870             CORE::pop @{$self->{stack}};
 871             CORE::push @tokens, token('TagClose', $tag->{name});
 872             last;
 873         }
 874         $i--;
 875     }
 876
 877     # If we're closing off the previous *token* (not stack
 878     # element), and there is therefore no content for that
 879     # token, either elide it if it is an empty block token,
 880     # or convert it to a self-closed tag
 881     if (@tokens && $self->lasttoken->{type} == $TAG &&
 882         $self->lasttoken->{name} eq $tokens[0]->{name}) {
 883         my $name = $tokens[0]->{name};
 884         my $tag_type = $TAG_TYPES{$name};
 885         if ($tag_type == $BLOCK) {
 886             # elide the token
 887             pop @tokens;
 888             pop @{$self->{tokens}};
 889         }
 890         if ($tag_type == $BLOCK_SELF_CLOSE) {
 891             pop @tokens;
 892             my $old_tag = pop @{$self->tokens};
 893             unshift @tokens, token('SelfClosedTag',
 894                                    $old_tag->{name},
 895                                    $old_tag->{atts});
 896         }
 897     }
 898
 899     # If there was no corresponding open tag, just discard it by
 900     # doing nothing on purpose.
 901
 902     # do_nothing();
 903
 904     # If we popped off a newline, now is when it goes back on
 905     CORE::push @tokens, $popped_newline if $popped_newline;
 906
 907     CORE::push @{$self->{tokens}}, @tokens;
 908 }
 909
 910 # Returns true if we are "in" the given tag, false otherwise. This
 911 # is used to implement special treatment for <pre>, for instance
 912 sub in_tag {
 913     my $self = shift;
 914     my $tag_name = shift;
 915
 916     for my $tag (@{$self->{stack}}) {
 917         return 1 if $tag->{name} eq $tag_name;
 918     }
 919     return 0;
 920 }
 921
 922 # Terminates the TagStack by popping all current elements. Used
 923 # at the end of the input string. Despite the name, not the same
 924 # as a DESTROY function.
 925 sub terminate {
 926     my $self = shift;
 927
 928     while (@{$self->{stack}}) {
 929         my $tag_still_open = CORE::pop @{$self->{stack}};
 930         CORE::push @{$self->{tokens}}, token('TagClose',
 931                                              $tag_still_open->{name});
 932     }
 933 }
 934
 935 1;
 936 __END__
 937
 938 =head1 NAME
 939
 940 Thrasher::HTMLNormalize - a module for normalizing (X)HTML for safe
 941 display
 942
 943 =head1 SYNOPSIS
 944
 945  use Thrasher::HTMLNormalize qw(:all);
 946
 947  # Take arbitrary HTML and make it syntactically correct
 948  my $normalizedHTML = normalize($html);
 949
 950  # Take arbitrary maybe-HTML and make it correct, and
 951  # use a chosen sub-set of HTML suitable for simple rich-text-like
 952  # HTML.
 953  my $comment_like_html = normalize($html, $CONSERVATIVE_PERMISSIONS);
 954
 955  # Take arbitrary maybe-HTML and make it correct, use a
 956  # a chosen sub-set of HTML suitable for simple rich-text-like
 957  # HTML, and if you see a "tag" that isn't actually HTML,
 958  # escape it as if it was text.
 959  my $message_log_like_html = normalize($html,
 960                                        $CONSERVATIVE_PERMISSIONS,
 961                                        undef, 1);
 962
 963
 964 =head1 MOTIVATION
 965
 966 This predated HTML::Normalize on CPAN. I just had it lying around, and
 967 I understand it, so I used it. I recommend checking out
 968 HTML::Normalize if you think you might want to use this. (On the
 969 other hand, I've had to implement features here that it doesn't have.)
 970
 971 This is a utility module; Thrasher::XHTMLNormalize specifically
 972 defines the normalization for XHTML-IM.
 973
 974 This is a utility module; Thrasher::XHTMLNormalize specifically
 975 defines the normalization for XHTML-IM.
 976
 977 =cut