From 6982036dc15ba85cdbf7339265f030ff7582ff3b Mon Sep 17 00:00:00 2001 From: pkasting Date: Mon, 21 Sep 2015 18:54:05 -0700 Subject: [PATCH] Correctly handle problematic nested escapes in URL paths. Specifically, if unescaping in the input leads to the output URL containing a new escaped sequence, e.g. converting the input "%%30%30" to "%00", escape the leading '%' as "%25" to ensure the output sequence is not treated as a new valid escape sequence. This ensures that canonicalizing the same URL a second time won't make changes to it, which is important for avoiding crashes and other bugs in a variety of places in both debug and release builds. BUG=533361 TEST=Visit http://andrisatteka.blogspot.com/2015/09/a-simple-string-to-crash-google-chrome.html , hover the link there, Chrome should not crash. Review URL: https://codereview.chromium.org/1358433004 Cr-Commit-Position: refs/heads/master@{#350086} --- url/url_canon_path.cc | 118 +++++++++++++++++++++++++++++++++++++++------- url/url_canon_unittest.cc | 15 ++++++ 2 files changed, 115 insertions(+), 18 deletions(-) diff --git a/url/url_canon_path.cc b/url/url_canon_path.cc index ee1cd9626c5d..6e5bfb1a245f 100644 --- a/url/url_canon_path.cc +++ b/url/url_canon_path.cc @@ -162,6 +162,76 @@ void BackUpToPreviousSlash(int path_begin_in_output, output->set_length(i + 1); } +// Looks for problematic nested escape sequences and escapes the output as +// needed to ensure they can't be misinterpreted. +// +// Our concern is that in input escape sequence that's invalid because it +// contains nested escape sequences might look valid once those are unescaped. +// For example, "%%300" is not a valid escape sequence, but after unescaping the +// inner "%30" this becomes "%00" which is valid. Leaving this in the output +// string can result in callers re-canonicalizing the string and unescaping this +// sequence, thus resulting in something fundamentally different than the +// original input here. This can cause a variety of problems. +// +// This function is called after we've just unescaped a sequence that's within +// two output characters of a previous '%' that we know didn't begin a valid +// escape sequence in the input string. We look for whether the output is going +// to turn into a valid escape sequence, and if so, convert the initial '%' into +// an escaped "%25" so the output can't be misinterpreted. +// +// |spec| is the input string we're canonicalizing. +// |next_input_index| is the index of the next unprocessed character in |spec|. +// |input_len| is the length of |spec|. +// |last_invalid_percent_index| is the index in |output| of a previously-seen +// '%' character. The caller knows this '%' character isn't followed by a valid +// escape sequence in the input string. +// |output| is the canonicalized output thus far. The caller guarantees this +// ends with a '%' followed by one or two characters, and the '%' is the one +// pointed to by |last_invalid_percent_index|. The last character in the string +// was just unescaped. +template +void CheckForNestedEscapes(const CHAR* spec, + int next_input_index, + int input_len, + int last_invalid_percent_index, + CanonOutput* output) { + const int length = output->length(); + const char last_unescaped_char = output->at(length - 1); + + // If |output| currently looks like "%c", we need to try appending the next + // input character to see if this will result in a problematic escape + // sequence. Note that this won't trigger on the first nested escape of a + // two-escape sequence like "%%30%30" -- we'll allow the conversion to + // "%0%30" -- but the second nested escape will be caught by this function + // when it's called again in that case. + const bool append_next_char = last_invalid_percent_index == length - 2; + if (append_next_char) { + // If the input doesn't contain a 7-bit character next, this case won't be a + // problem. + if ((next_input_index == input_len) || (spec[next_input_index] >= 0x80)) + return; + output->push_back(static_cast(spec[next_input_index])); + } + + // Now output ends like "%cc". Try to unescape this. + int begin = last_invalid_percent_index; + unsigned char temp; + if (DecodeEscaped(output->data(), &begin, output->length(), &temp)) { + // New escape sequence found. Overwrite the characters following the '%' + // with "25", and push_back() the one or two characters that were following + // the '%' when we were called. + if (!append_next_char) + output->push_back(output->at(last_invalid_percent_index + 1)); + output->set(last_invalid_percent_index + 1, '2'); + output->set(last_invalid_percent_index + 2, '5'); + output->push_back(last_unescaped_char); + } else if (append_next_char) { + // Not a valid escape sequence, but we still need to undo appending the next + // source character so the caller can process it normally. + output->set_length(length); + } +} + // Appends the given path to the output. It assumes that if the input path // starts with a slash, it should be copied to the output. If no path has // already been appended to the output (the case when not resolving @@ -182,10 +252,15 @@ bool DoPartialPath(const CHAR* spec, CanonOutput* output) { int end = path.end(); + // We use this variable to minimize the amount of work done when unescaping -- + // we'll only call CheckForNestedEscapes() when this points at one of the last + // couple of characters in |output|. + int last_invalid_percent_index = INT_MIN; + bool success = true; for (int i = path.begin; i < end; i++) { UCHAR uch = static_cast(spec[i]); - if (sizeof(CHAR) > sizeof(char) && uch >= 0x80) { + if (sizeof(CHAR) > 1 && uch >= 0x80) { // We only need to test wide input for having non-ASCII characters. For // narrow input, we'll always just use the lookup table. We don't try to // do anything tricky with decoding/validating UTF-8. This function will @@ -245,33 +320,40 @@ bool DoPartialPath(const CHAR* spec, unsigned char unescaped_value; if (DecodeEscaped(spec, &i, end, &unescaped_value)) { // Valid escape sequence, see if we keep, reject, or unescape it. + // Note that at this point DecodeEscape() will have advanced |i| to + // the last character of the escape sequence. char unescaped_flags = kPathCharLookup[unescaped_value]; if (unescaped_flags & UNESCAPE) { - // This escaped value shouldn't be escaped, copy it. + // This escaped value shouldn't be escaped. Try to copy it. output->push_back(unescaped_value); - } else if (unescaped_flags & INVALID_BIT) { - // Invalid escaped character, copy it and remember the error. - output->push_back('%'); - output->push_back(static_cast(spec[i - 1])); - output->push_back(static_cast(spec[i])); - success = false; + // If we just unescaped a value within 2 output characters of the + // '%' from a previously-detected invalid escape sequence, we + // might have an input string with problematic nested escape + // sequences; detect and fix them. + if (last_invalid_percent_index >= (output->length() - 3)) { + CheckForNestedEscapes(spec, i + 1, end, + last_invalid_percent_index, output); + } } else { - // Valid escaped character but we should keep it escaped. We - // don't want to change the case of any hex letters in case - // the server is sensitive to that, so we just copy the two - // characters without checking (DecodeEscape will have advanced - // to the last character of the pair). + // Either this is an invalid escaped character, or it's a valid + // escaped character we should keep escaped. In the first case we + // should just copy it exactly and remember the error. In the + // second we also copy exactly in case the server is sensitive to + // changing the case of any hex letters. output->push_back('%'); output->push_back(static_cast(spec[i - 1])); output->push_back(static_cast(spec[i])); + if (unescaped_flags & INVALID_BIT) + success = false; } } else { - // Invalid escape sequence. IE7 rejects any URLs with such - // sequences, while Firefox, IE6, and Safari all pass it through - // unchanged. We are more permissive unlike IE7. I don't think this - // can cause significant problems, if it does, we should change - // to be more like IE7. + // Invalid escape sequence. IE7+ rejects any URLs with such + // sequences, while other browsers pass them through unchanged. We + // use the permissive behavior. + // TODO(brettw): Consider testing IE's strict behavior, which would + // allow removing the code to handle nested escapes above. + last_invalid_percent_index = output->length(); output->push_back('%'); } diff --git a/url/url_canon_unittest.cc b/url/url_canon_unittest.cc index 0ccd6c93009e..b382a7cda2aa 100644 --- a/url/url_canon_unittest.cc +++ b/url/url_canon_unittest.cc @@ -1060,6 +1060,21 @@ TEST(URLCanonTest, Path) { {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd", "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true}, // @ should be passed through unchanged (escaped or unescaped). {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true}, + // Nested escape sequences should result in escaping the leading '%' if + // unescaping would result in a new escape sequence. + {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true}, + {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true}, + {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true}, + // Make sure truncated "nested" escapes don't result in reading off the + // string end. + {"/%%41", L"/%%41", "/%A", Component(0, 3), true}, + // Don't unescape the leading '%' if unescaping doesn't result in a valid + // new escape sequence. + {"/%%470", L"/%%470", "/%G0", Component(0, 4), true}, + {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true}, + // Don't erroneously downcast a UTF-16 charater in a way that makes it + // look like part of an escape sequence. + {NULL, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true}, // ----- encoding tests ----- // Basic conversions -- 2.11.4.GIT