patches/gitweb/q/gitweb-workaround-surrogate-code-point-problem.diff

   1 Subject: [PATCH] gitweb: workaround surrogate code point problem
   2
   3 When gitweb attempts to automatically treat repository data as
   4 UTF-8, it uses utf8::decode to activate Perl's UTF-8 flag.
   5
   6 Unfortunately, surrogate code points (0xD800-0xDFFF) are also
   7 converted to UTF-8 if present in the input.  However, those
   8 code points are only valid in UTF-16.  Attempting to do any kind
   9 of pattern match substitution on strings that contain these
  10 UTF-8-encoded surrogate code points will result in a fatal
  11 'Malformed UTF-8 character' error.
  12
  13 The substitution in question is attempting to replace control
  14 characters with nice-looking escape sequences.  It only needs
  15 to detect character values 0x00-0x1f, so switch into bytes mode
  16 for the substitution to avoid the fatal error.
  17
  18 This results in the surrogates actually being sent back to the
  19 browser for display, which typically results in them being
  20 rendered as a replacement character (U+FFFD).
  21
  22 Signed-off-by: Kyle J. McKay <mackyle@gmail.com>
  23 ---
  24  gitweb/gitweb.perl | 5 +++++
  25  1 file changed, 5 insertions(+)
  26
  27 diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl
  28 index f26f2d1d..896f2ceb 100755
  29 --- a/gitweb/gitweb.perl
  30 +++ b/gitweb/gitweb.perl
  31 @@ -1749,6 +1749,7 @@ sub esc_html {
  32         if ($opts{'-nbsp'}) {
  33                 $str =~ s/ /&#160;/g;
  34         }
  35 +       use bytes;
  36         $str =~ s|([[:cntrl:]])|(($1 ne "\t") ? quot_cec($1) : $1)|eg;
  37         return $str;
  38  }
  39 @@ -1765,6 +1766,7 @@ sub esc_path {
  40         if ($opts{'-nbsp'}) {
  41                 $str =~ s/ /&#160;/g;
  42         }
  43 +       use bytes;
  44         $str =~ s|([[:cntrl:]])|quot_cec($1)|eg;
  45         return $str;
  46  }
  47 @@ -1776,6 +1778,7 @@ sub sanitize {
  48         return undef unless defined $str;
  49
  50         $str = to_utf8($str);
  51 +       use bytes;
  52         $str =~ s|([[:cntrl:]])|(index("\t\n\r", $1) != -1 ? $1 : quot_cec($1))|eg;
  53         return $str;
  54  }
  55 @@ -1949,6 +1952,7 @@ sub chop_and_escape_str {
  56         if ($chopped eq $str) {
  57                 return esc_html($chopped);
  58         } else {
  59 +               use bytes;
  60                 $str =~ s/[[:cntrl:]]/?/g;
  61                 return $cgi->span({-title=>$str}, esc_html($chopped));
  62         }
  63 @@ -2263,6 +2267,7 @@ sub format_subject_html {
  64         $extra = '' unless defined($extra);
  65
  66         if (length($short) < length($long)) {
  67 +               use bytes;
  68                 $long =~ s/[[:cntrl:]]/?/g;
  69                 return $cgi->a({-href => $href, -class => "list subject",
  70                                 -title => to_utf8($long)},
  71 ---