Updating trunk VERSION from 2139.0 to 2140.0
[chromium-blink-merge.git] / net / tools / dump_cache / url_to_filename_encoder.cc
blobb807ec092671b01671277fd1986f97a75d4c7797
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <stdlib.h>
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/base/net_util.h"
10 #include "net/tools/dump_cache/url_to_filename_encoder.h"
12 using std::string;
14 namespace {
16 // Returns 1 if buf is prefixed by "num_digits" of hex digits
17 // Teturns 0 otherwise.
18 // The function checks for '\0' for string termination.
19 int HexDigitsPrefix(const char* buf, int num_digits) {
20 for (int i = 0; i < num_digits; i++) {
21 if (!IsHexDigit(buf[i]))
22 return 0; // This also detects end of string as '\0' is not xdigit.
24 return 1;
27 #ifdef WIN32
28 #define strtoull _strtoui64
29 #endif
31 // A simple parser for long long values. Returns the parsed value if a
32 // valid integer is found; else returns deflt
33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s.
34 uint64 ParseLeadingHex64Value(const char *str, uint64 deflt) {
35 char *error = NULL;
36 const uint64 value = strtoull(str, &error, 16);
37 return (error == str) ? deflt : value;
42 namespace net {
44 // kEscapeChar starts every escape sequence written by EncodeSegment. All
45 // code and tests in this directory are based on this constant, and the test
46 // data depends on it heavily, so it cannot be changed without
47 // re-running those tests and fixing them.
48 const char UrlToFilenameEncoder::kEscapeChar = ',';
// kTruncationChar is written right after kEscapeChar (",-") by AppendSegment
// where an over-long segment is split; Decode then expects a directory
// separator next and skips it.
49 const char UrlToFilenameEncoder::kTruncationChar = '-';
// Segments longer than this many characters are split by AppendSegment.
50 const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength = 128;
52 void UrlToFilenameEncoder::AppendSegment(string* segment, string* dest) {
53 CHECK(!segment->empty());
54 if ((*segment == ".") || (*segment == "..")) {
55 dest->append(1, kEscapeChar);
56 dest->append(*segment);
57 segment->clear();
58 } else {
59 size_t segment_size = segment->size();
60 if (segment_size > kMaximumSubdirectoryLength) {
61 // We need to inject ",-" at the end of the segment to signify that
62 // we are inserting an artificial '/'. This means we have to chop
63 // off at least two characters to make room.
64 segment_size = kMaximumSubdirectoryLength - 2;
66 // But we don't want to break up an escape sequence that happens to lie at
67 // the end. Escape sequences are at most 2 characters.
68 if ((*segment)[segment_size - 1] == kEscapeChar) {
69 segment_size -= 1;
70 } else if ((*segment)[segment_size - 2] == kEscapeChar) {
71 segment_size -= 2;
73 dest->append(segment->data(), segment_size);
74 dest->append(1, kEscapeChar);
75 dest->append(1, kTruncationChar);
76 segment->erase(0, segment_size);
78 // At this point, if we had segment_size=3, and segment="abcd",
79 // then after this erase, we will have written "abc,-" and set segment="d"
80 } else {
81 dest->append(*segment);
82 segment->clear();
87 void UrlToFilenameEncoder::EncodeSegment(const string& filename_prefix,
88 const string& escaped_ending,
89 char dir_separator,
90 string* encoded_filename) {
91 string filename_ending = UrlUtilities::Unescape(escaped_ending);
93 char encoded[3];
94 int encoded_len;
95 string segment;
97 // TODO(jmarantz): This code would be a bit simpler if we disallowed
98 // Instaweb allowing filename_prefix to not end in "/". We could
99 // then change the is routine to just take one input string.
100 size_t start_of_segment = filename_prefix.find_last_of(dir_separator);
101 if (start_of_segment == string::npos) {
102 segment = filename_prefix;
103 } else {
104 segment = filename_prefix.substr(start_of_segment + 1);
105 *encoded_filename = filename_prefix.substr(0, start_of_segment + 1);
108 size_t index = 0;
109 // Special case the first / to avoid adding a leading kEscapeChar.
110 if (!filename_ending.empty() && (filename_ending[0] == dir_separator)) {
111 encoded_filename->append(segment);
112 segment.clear();
113 encoded_filename->append(1, dir_separator);
114 ++index;
117 for (; index < filename_ending.length(); ++index) {
118 unsigned char ch = static_cast<unsigned char>(filename_ending[index]);
120 // Note: instead of outputing an empty segment, we let the second slash
121 // be escaped below.
122 if ((ch == dir_separator) && !segment.empty()) {
123 AppendSegment(&segment, encoded_filename);
124 encoded_filename->append(1, dir_separator);
125 segment.clear();
126 } else {
127 // After removing unsafe chars the only safe ones are _.=+- and alphanums.
128 if ((ch == '_') || (ch == '.') || (ch == '=') || (ch == '+') ||
129 (ch == '-') || (('0' <= ch) && (ch <= '9')) ||
130 (('A' <= ch) && (ch <= 'Z')) || (('a' <= ch) && (ch <= 'z'))) {
131 encoded[0] = ch;
132 encoded_len = 1;
133 } else {
134 encoded[0] = kEscapeChar;
135 encoded[1] = ch / 16;
136 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
137 encoded[2] = ch % 16;
138 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
139 encoded_len = 3;
141 segment.append(encoded, encoded_len);
143 // If segment is too big, we must chop it into chunks.
144 if (segment.size() > kMaximumSubdirectoryLength) {
145 AppendSegment(&segment, encoded_filename);
146 encoded_filename->append(1, dir_separator);
151 // Append "," to the leaf filename so the leaf can also be a branch., e.g.
152 // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
153 // /a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed
154 // us over the 128 char limit, then we will need to append "/" and the
155 // remaining chars.
156 segment += kEscapeChar;
157 AppendSegment(&segment, encoded_filename);
158 if (!segment.empty()) {
159 // The last overflow segment is special, because we appended in
160 // kEscapeChar above. We won't need to check it again for size
161 // or further escaping.
162 encoded_filename->append(1, dir_separator);
163 encoded_filename->append(segment);
167 // Note: this decoder is not the exact inverse of the EncodeSegment above,
168 // because it does not take into account a prefix.
169 bool UrlToFilenameEncoder::Decode(const string& encoded_filename,
170 char dir_separator,
171 string* decoded_url) {
172 enum State {
173 kStart,
174 kEscape,
175 kFirstDigit,
176 kTruncate,
177 kEscapeDot
179 State state = kStart;
180 char hex_buffer[3];
181 hex_buffer[2] = '\0';
182 for (size_t i = 0; i < encoded_filename.size(); ++i) {
183 char ch = encoded_filename[i];
184 switch (state) {
185 case kStart:
186 if (ch == kEscapeChar) {
187 state = kEscape;
188 } else if (ch == dir_separator) {
189 decoded_url->append(1, '/'); // URLs only use '/' not '\\'
190 } else {
191 decoded_url->append(1, ch);
193 break;
194 case kEscape:
195 if (HexDigitsPrefix(&ch, 1) == 1) {
196 hex_buffer[0] = ch;
197 state = kFirstDigit;
198 } else if (ch == kTruncationChar) {
199 state = kTruncate;
200 } else if (ch == '.') {
201 decoded_url->append(1, '.');
202 state = kEscapeDot; // Look for at most one more dot.
203 } else if (ch == dir_separator) {
204 // Consider url "//x". This was once encoded to "/,/x,".
205 // This code is what skips the first Escape.
206 decoded_url->append(1, '/'); // URLs only use '/' not '\\'
207 state = kStart;
208 } else {
209 return false;
211 break;
212 case kFirstDigit:
213 if (HexDigitsPrefix(&ch, 1) == 1) {
214 hex_buffer[1] = ch;
215 uint64 hex_value = ParseLeadingHex64Value(hex_buffer, 0);
216 decoded_url->append(1, static_cast<char>(hex_value));
217 state = kStart;
218 } else {
219 return false;
221 break;
222 case kTruncate:
223 if (ch == dir_separator) {
224 // Skip this separator, it was only put in to break up long
225 // path segments, but is not part of the URL.
226 state = kStart;
227 } else {
228 return false;
230 break;
231 case kEscapeDot:
232 decoded_url->append(1, ch);
233 state = kStart;
234 break;
238 // All legal encoded filenames end in kEscapeChar.
239 return (state == kEscape);
242 // Escape the given input |path| and chop any individual components
243 // of the path which are greater than kMaximumSubdirectoryLength characters
244 // into two chunks.
246 // This legacy version has several issues with aliasing of different URLs,
247 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
248 // the filenames back into URLs.
250 // But there is a large body of slurped data which depends on this format,
251 // so leave it as the default for spdy_in_mem_edsm_server.
252 string UrlToFilenameEncoder::LegacyEscape(const string& path) {
253 string output;
255 // Note: We also chop paths into medium sized 'chunks'.
256 // This is due to the incompetence of the windows
257 // filesystem, which still hasn't figured out how
258 // to deal with long filenames.
259 int last_slash = 0;
260 for (size_t index = 0; index < path.length(); index++) {
261 char ch = path[index];
262 if (ch == 0x5C)
263 last_slash = index;
264 if ((ch == 0x2D) || // hyphen
265 (ch == 0x5C) || (ch == 0x5F) || // backslash, underscore
266 ((0x30 <= ch) && (ch <= 0x39)) || // Digits [0-9]
267 ((0x41 <= ch) && (ch <= 0x5A)) || // Uppercase [A-Z]
268 ((0x61 <= ch) && (ch <= 0x7A))) { // Lowercase [a-z]
269 output.append(&path[index], 1);
270 } else {
271 char encoded[3];
272 encoded[0] = 'x';
273 encoded[1] = ch / 16;
274 encoded[1] += (encoded[1] >= 10) ? 'A' - 10 : '0';
275 encoded[2] = ch % 16;
276 encoded[2] += (encoded[2] >= 10) ? 'A' - 10 : '0';
277 output.append(encoded, 3);
279 if (index - last_slash > kMaximumSubdirectoryLength) {
280 #ifdef WIN32
281 char slash = '\\';
282 #else
283 char slash = '/';
284 #endif
285 output.append(&slash, 1);
286 last_slash = index;
289 return output;
292 } // namespace net