1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "net/base/net_util.h"
10 #include "net/tools/flip_server/url_to_filename_encoder.h"
16 // Returns 1 if buf is prefixed by "num_digits" of hex digits
17 // Returns 0 otherwise.
18 // The function checks for '\0' for string termination.
// Tests whether the first |num_digits| characters of |buf| are all hex
// digits, as judged by base::IsHexDigit.
//   buf:        characters to inspect; indices 0 .. num_digits - 1 are read
//               until a non-hex character is met.
//   num_digits: number of leading characters to test.
// Returns 0 as soon as a non-hex character is found.
// NOTE(review): the success return path is not visible in this excerpt.
19 int HexDigitsPrefix(const char* buf
, int num_digits
) {
// Scan left to right and bail out on the first character that fails the test.
20 for (int i
= 0; i
< num_digits
; i
++) {
21 if (!base::IsHexDigit(buf
[i
]))
22 return 0; // This also detects end of string as '\0' is not xdigit.
28 #define strtoull _strtoui64
31 // A simple parser for unsigned 64-bit hexadecimal values. Returns the parsed
32 // value if a valid integer is found; else returns deflt.
33 // UInt64 and Int64 cannot handle decimal numbers with leading 0s.
// Parses a leading base-16 number from |str| with strtoull.
//   str:   C string to parse.
//   deflt: value handed back when nothing could be converted.
// Returns the converted value, or |deflt| when strtoull's end pointer is
// still equal to |str|.
34 uint64
ParseLeadingHex64Value(const char* str
, uint64 deflt
) {
// |error| receives strtoull's end-of-parse pointer.
// NOTE(review): its declaration is not visible in this excerpt.
36 const uint64 value
= strtoull(str
, &error
, 16);
// An end pointer that never moved means no characters were consumed.
37 return (error
== str
) ? deflt
: value
;
43 // The escape character choice is made here -- all code and tests in this
44 // directory are based off of this constant. However, our testdata
45 // has tons of dependencies on this, so it cannot be changed without
46 // re-running those tests and fixing them.
// Character that starts every escape sequence written by the encoder.
47 const char UrlToFilenameEncoder::kEscapeChar
= ',';
// Written right after kEscapeChar when AppendSegment cuts an over-long segment.
48 const char UrlToFilenameEncoder::kTruncationChar
= '-';
// Segments longer than this many characters are chopped by AppendSegment.
49 const size_t UrlToFilenameEncoder::kMaximumSubdirectoryLength
= 128;
// Appends the path segment |*segment| to |*dest|, escaping or truncating it
// when needed.
//   segment: in/out; must be non-empty on entry (enforced by CHECK). When the
//            segment is longer than kMaximumSubdirectoryLength, the prefix
//            that was emitted is erased from it.
//   dest:    output string that receives the encoded text.
// NOTE(review): several lines of this function (branch bodies, else lines and
// closing braces) are not visible in this excerpt; the comments below
// describe only the statements that are shown.
51 void UrlToFilenameEncoder::AppendSegment(string
* segment
, string
* dest
) {
52 CHECK(!segment
->empty());
// A segment that is exactly "." or ".." is written with a leading kEscapeChar.
53 if ((*segment
== ".") || (*segment
== "..")) {
54 dest
->append(1, kEscapeChar
);
55 dest
->append(*segment
);
// Measure the segment to decide whether it has to be truncated.
58 size_t segment_size
= segment
->size();
59 if (segment_size
> kMaximumSubdirectoryLength
) {
60 // We need to inject ",-" at the end of the segment to signify that
61 // we are inserting an artificial '/'. This means we have to chop
62 // off at least two characters to make room.
63 segment_size
= kMaximumSubdirectoryLength
- 2;
65 // But we don't want to break up an escape sequence that happens to lie at
66 // the end. Escape sequences are at most 2 characters.
// The two tests below look for kEscapeChar in the last and second-to-last
// kept positions. NOTE(review): the adjustment made in each branch is not
// visible in this excerpt.
67 if ((*segment
)[segment_size
- 1] == kEscapeChar
) {
69 } else if ((*segment
)[segment_size
- 2] == kEscapeChar
) {
// Emit the kept prefix followed by kEscapeChar + kTruncationChar, then drop
// that prefix from |*segment|.
72 dest
->append(segment
->data(), segment_size
);
73 dest
->append(1, kEscapeChar
);
74 dest
->append(1, kTruncationChar
);
75 segment
->erase(0, segment_size
);
77 // At this point, if we had segment_size=3, and segment="abcd",
78 // then after this erase, we will have written "abc,-" and set segment="d"
// Write |*segment| to |*dest|. NOTE(review): which branch encloses this
// statement cannot be determined from this excerpt.
80 dest
->append(*segment
);
// Builds an encoded filename from |filename_prefix| followed by the
// unescaped form of |escaped_ending|, writing into |*encoded_filename|.
//   filename_prefix:  leading part of the name; it is split at its last
//                     dir_separator (see below).
//   escaped_ending:   passed through UrlUtilities::Unescape, then encoded
//                     character by character.
//   encoded_filename: output string.
// NOTE(review): |dir_separator|, |segment|, |index|, |encoded| and
// |encoded_len| are used below but their declarations are not visible in
// this excerpt, and several branch bodies / else lines are missing as well.
86 void UrlToFilenameEncoder::EncodeSegment(const string
& filename_prefix
,
87 const string
& escaped_ending
,
89 string
* encoded_filename
) {
90 string filename_ending
= UrlUtilities::Unescape(escaped_ending
);
96 // TODO(jmarantz): This code would be a bit simpler if we disallowed
97 // Instaweb allowing filename_prefix to not end in "/". We could
98 // then change this routine to just take one input string.
// Split the prefix at its last dir_separator. With no separator the whole
// prefix becomes the initial |segment|; the other visible assignments put the
// tail after the separator into |segment| and the head (separator included)
// into |*encoded_filename|.
99 size_t start_of_segment
= filename_prefix
.find_last_of(dir_separator
);
100 if (start_of_segment
== string::npos
) {
101 segment
= filename_prefix
;
103 segment
= filename_prefix
.substr(start_of_segment
+ 1);
104 *encoded_filename
= filename_prefix
.substr(0, start_of_segment
+ 1);
108 // Special case the first / to avoid adding a leading kEscapeChar.
109 if (!filename_ending
.empty() && (filename_ending
[0] == dir_separator
)) {
110 encoded_filename
->append(segment
);
112 encoded_filename
->append(1, dir_separator
);
// Walk the remaining characters of the unescaped ending, starting at |index|.
116 for (; index
< filename_ending
.length(); ++index
) {
117 unsigned char ch
= static_cast<unsigned char>(filename_ending
[index
]);
119 // Note: instead of outputting an empty segment, we let the second slash
// A separator that follows a non-empty segment flushes that segment through
// AppendSegment and is then written to the output itself.
121 if ((ch
== dir_separator
) && !segment
.empty()) {
122 AppendSegment(&segment
, encoded_filename
);
123 encoded_filename
->append(1, dir_separator
);
126 // After removing unsafe chars the only safe ones are _.=+- and alphanums.
// NOTE(review): the body executed for safe characters is not visible here.
127 if ((ch
== '_') || (ch
== '.') || (ch
== '=') || (ch
== '+') ||
128 (ch
== '-') || (('0' <= ch
) && (ch
<= '9')) ||
129 (('A' <= ch
) && (ch
<= 'Z')) || (('a' <= ch
) && (ch
<= 'z'))) {
// Build a three-character escape: kEscapeChar followed by the high and low
// nibbles of |ch| rendered as uppercase hex digits.
133 encoded
[0] = kEscapeChar
;
134 encoded
[1] = ch
/ 16;
135 encoded
[1] += (encoded
[1] >= 10) ? 'A' - 10 : '0';
136 encoded
[2] = ch
% 16;
137 encoded
[2] += (encoded
[2] >= 10) ? 'A' - 10 : '0';
// Add |encoded_len| characters of |encoded| to the segment being built.
140 segment
.append(encoded
, encoded_len
);
142 // If segment is too big, we must chop it into chunks.
143 if (segment
.size() > kMaximumSubdirectoryLength
) {
144 AppendSegment(&segment
, encoded_filename
);
145 encoded_filename
->append(1, dir_separator
);
150 // Append "," to the leaf filename so the leaf can also be a branch, e.g.
151 // allow http://a/b/c and http://a/b/c/d to co-exist as files "/a/b/c," and
152 // "/a/b/c/d". So we will rename the "d" here to "d,". If doing that pushed
153 // us over the 128 char limit, then we will need to append "/" and the
155 segment
+= kEscapeChar
;
156 AppendSegment(&segment
, encoded_filename
);
// Anything AppendSegment left in |segment| is written after one more
// separator.
157 if (!segment
.empty()) {
158 // The last overflow segment is special, because we appended in
159 // kEscapeChar above. We won't need to check it again for size
160 // or further escaping.
161 encoded_filename
->append(1, dir_separator
);
162 encoded_filename
->append(segment
);
166 // Note: this decoder is not the exact inverse of the EncodeSegment above,
167 // because it does not take into account a prefix.
// Decodes |encoded_filename| into |*decoded_url| one character at a time,
// tracking progress in a small state machine (see enum State).
//   encoded_filename: text to decode.
//   decoded_url:      output; decoded characters are appended to it.
// Returns true only when the state machine finishes in kEscape.
// NOTE(review): the switch/case scaffolding, most state transitions, and the
// declarations of |dir_separator| and |hex_buffer| are not visible in this
// excerpt; the comments below describe only the statements that are shown.
168 bool UrlToFilenameEncoder::Decode(const string
& encoded_filename
,
170 string
* decoded_url
) {
171 enum State
{ kStart
, kEscape
, kFirstDigit
, kTruncate
, kEscapeDot
};
172 State state
= kStart
;
// NUL-terminate |hex_buffer| at index 2 so it can later be parsed as a C
// string of at most two characters.
174 hex_buffer
[2] = '\0';
175 for (size_t i
= 0; i
< encoded_filename
.size(); ++i
) {
176 char ch
= encoded_filename
[i
];
// Three-way test on |ch|: kEscapeChar, dir_separator, or anything else.
// NOTE(review): the body of the kEscapeChar branch is not visible.
179 if (ch
== kEscapeChar
) {
181 } else if (ch
== dir_separator
) {
182 decoded_url
->append(1, '/'); // URLs only use '/' not '\\'
184 decoded_url
->append(1, ch
);
// HexDigitsPrefix(&ch, 1) inspects only |ch| itself.
188 if (HexDigitsPrefix(&ch
, 1) == 1) {
191 } else if (ch
== kTruncationChar
) {
193 } else if (ch
== '.') {
194 decoded_url
->append(1, '.');
195 state
= kEscapeDot
; // Look for at most one more dot.
196 } else if (ch
== dir_separator
) {
197 // Consider url "//x". This was once encoded to "/,/x,".
198 // This code is what skips the first Escape.
199 decoded_url
->append(1, '/'); // URLs only use '/' not '\\'
206 if (HexDigitsPrefix(&ch
, 1) == 1) {
// Convert the text held in |hex_buffer| to a number (0 if nothing parses)
// and append it to the output as a single char.
208 uint64 hex_value
= ParseLeadingHex64Value(hex_buffer
, 0);
209 decoded_url
->append(1, static_cast<char>(hex_value
));
216 if (ch
== dir_separator
) {
217 // Skip this separator, it was only put in to break up long
218 // path segments, but is not part of the URL.
225 decoded_url
->append(1, ch
);
231 // All legal encoded filenames end in kEscapeChar.
232 return (state
== kEscape
);
235 // Escape the given input |path| and chop any individual components
236 // of the path which are greater than kMaximumSubdirectoryLength characters
239 // This legacy version has several issues with aliasing of different URLs,
240 // inability to represent both /a/b/c and /a/b/c/d, and inability to decode
241 // the filenames back into URLs.
243 // But there is a large body of slurped data which depends on this format,
244 // so leave it as the default for spdy_in_mem_edsm_server.
245 string
UrlToFilenameEncoder::LegacyEscape(const string
& path
) {
248 // Note: We also chop paths into medium sized 'chunks'.
249 // This is due to the incompetence of the windows
250 // filesystem, which still hasn't figured out how
251 // to deal with long filenames.
253 for (size_t index
= 0; index
< path
.length(); index
++) {
254 char ch
= path
[index
];
257 if ((ch
== 0x2D) || // hyphen
258 (ch
== 0x5C) || (ch
== 0x5F) || // backslash, underscore
259 ((0x30 <= ch
) && (ch
<= 0x39)) || // Digits [0-9]
260 ((0x41 <= ch
) && (ch
<= 0x5A)) || // Uppercase [A-Z]
261 ((0x61 <= ch
) && (ch
<= 0x7A))) { // Lowercase [a-z]
262 output
.append(&path
[index
], 1);
266 encoded
[1] = ch
/ 16;
267 encoded
[1] += (encoded
[1] >= 10) ? 'A' - 10 : '0';
268 encoded
[2] = ch
% 16;
269 encoded
[2] += (encoded
[2] >= 10) ? 'A' - 10 : '0';
270 output
.append(encoded
, 3);
272 if (index
- last_slash
> kMaximumSubdirectoryLength
) {
278 output
.append(&slash
, 1);