1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // URL filename encoder goals:
7 // 1. Allow URLs with arbitrary path-segment length, generating filenames
8 // with a maximum of 128 characters.
9 // 2. Provide somewhat human-readable filenames, for easy debugging.
10 // 3. Provide reverse-mapping from filenames back to URLs.
11 // 4. Be able to distinguish http://x from http://x/ from http://x/index.html.
12 // Those can all be different URLs.
13 // 5. Be able to represent http://a/b/c and http://a/b/c/d, a pattern seen
14 // with Facebook Connect.
16 // We need an escape-character for representing characters that are legal
17 // in URL paths, but not in filenames, such as '?'.
19 // We can pick any legal character as an escape, as long as we escape it too.
20 // But as we have a goal of having filenames that humans can correlate with
21 // URLs, we should pick one that doesn't show up frequently in URLs. Candidates
22 // are ~`!@#$%^&()-=_+{}[],. but we would prefer to avoid characters that are
23 // shell escapes or that various build tools use.
25 // .#&%-=_+ occur frequently in URLs.
26 // <>:"/\|?* are illegal in Windows
27 // See http://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx
28 // ~`!$^&(){}[]'; are special to Unix shells
29 // In addition, build tools do not like ^@#%
31 // Josh took a quick look at the frequency of some special characters in
32 // Sadeesh's slurped directory from Fall 09 and found the following occurrences:
34 // ^ 3 build tool doesn't like ^ in testdata filenames
35 // @ 10 build tool doesn't like @ in testdata filenames
36 // . 1676 too frequent in URLs
38 // # 0 build tool doesn't like it
39 // & 487 Prefer to avoid shell escapes
40 // % 374 g4 doesn't like it
41 // = 579 very frequent in URLs -- leave unmodified
42 // - 464 very frequent in URLs -- leave unmodified
43 // _ 798 very frequent in URLs -- leave unmodified
46 // The escaping algorithm is:
47 // 1) Escape all unfriendly symbols as ,XX where XX is the hex code.
48 // 2) Add a ',' at the end (We do not allow ',' at end of any directory name,
49 // so this assures that e.g. /a and /a/b can coexist in the filesystem).
50 // 3) Go through the path segment by segment (where a segment is one directory
51 // or leaf in the path) and
52 // 3a) If the segment is empty, escape the second slash. i.e. if it was
53 // www.foo.com//a then we escape the second / like www.foo.com/,2Fa,
54 // 3b) If it is "." or ".." prepend with ',' (so that we have a non-
55 // empty and non-reserved filename).
56 // 3c) If it is over 128 characters, break it up into smaller segments by
57 // inserting ,-/ (Windows limits paths to 128 chars, other OSes also
58 // have limits that would restrict us)
63 // /index.html /index.html,
67 // /a/b/c /a/b/c, Note: no prefix problem
68 // /u?foo=bar /u,3Ffoo=bar,
74 // /very...longname/ /very...long,-/name If very...long is about 126 long.
76 // NOTE: we avoid using some classes here (like FilePath and GURL) because we
77 // share this code with other projects externally.
79 #ifndef NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
80 #define NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_
84 #include "base/strings/string_util.h"
85 #include "net/tools/dump_cache/url_utilities.h"
89 // Helper class for converting a URL into a filename.
90 class UrlToFilenameEncoder
{
// Builds the filename that represents |url|, rooted at |base_path|.
// Two strategies are visible below: when |legacy_escape| is false the URL is
// reduced with UrlUtilities::GetUrlHostPath() and handed to EncodeSegment();
// the remaining statements assemble the name by hand (host, then the path run
// through ConvertToSlashes(), StripDoubleSlashes() and LegacyEscape()).
// NOTE(review): this listing appears to be missing lines -- the
// |legacy_escape| parameter, the declaration of |filename|, the
// else/preprocessor structure and the return statement are not visible.
// Confirm every structural remark below against the full file.
92 // Given a |url| and a |base_path|, returns a filename which represents this
93 // |url|. |url| may include URL escaping such as %21 for !
94 // |legacy_escape| indicates that this function should use the old-style
96 // TODO(mbelshe): delete the legacy_escape code.
97 static std::string
Encode(const std::string
& url
, std::string base_path
,
// Non-legacy path: |base_path| is passed as the unescaped prefix and the
// result of GetUrlHostPath() (presumably the URL without its scheme, per the
// variable name) is encoded with '/' as the directory separator.
100 if (!legacy_escape
) {
101 std::string url_no_scheme
= UrlUtilities::GetUrlHostPath(url
);
102 EncodeSegment(base_path
, url_no_scheme
, '/', &filename
);
// Turns every '/' in the encoded name into a backslash.
// NOTE(review): presumably compiled on Windows only; no guard is visible here.
104 ReplaceAll(&filename
, "/", "\\");
// NOTE(review): the statements from here on presumably form the legacy
// branch; the line that opens that branch is not visible in this listing.
// Work on a copy of |url| so that a trailing '/' can become ".../index.html".
107 std::string
clean_url(url
);
// The length() check keeps the length()-1 index from wrapping on an empty URL.
108 if (clean_url
.length() && clean_url
[clean_url
.length()-1] == '/')
109 clean_url
.append("index.html");
// The filename starts with |base_path| immediately followed by the host.
111 std::string host
= UrlUtilities::GetUrlHost(clean_url
);
112 filename
.append(base_path
);
113 filename
.append(host
);
// Separator between the host and the path part.
// NOTE(review): both a backslash append and a forward-slash append are
// visible; presumably only one of them is compiled per platform -- confirm.
115 filename
.append("\\");
117 filename
.append("/");
120 std::string url_filename
= UrlUtilities::GetUrlPath(clean_url
);
121 // Strip the leading '/'.
122 if (url_filename
[0] == '/')
123 url_filename
= url_filename
.substr(1);
125 // Replace '/' with '\'.
126 ConvertToSlashes(&url_filename
);
// StripDoubleSlashes() rewrites each "\\" pair as "%5C%5C" rather than
// deleting it.
128 // Strip double back-slashes ("\\\\").
129 StripDoubleSlashes(&url_filename
);
131 // Save path as filesystem-safe characters.
132 url_filename
= LegacyEscape(url_filename
);
133 filename
.append(url_filename
);
// Replaces every backslash in |filename| with a forward slash.
// NOTE(review): presumably skipped on Windows; no guard is visible here.
136 // Last step - convert to native slashes.
137 const std::string
slash("/");
138 const std::string
backslash("\\");
139 ReplaceAll(&filename
, backslash
, slash
);
// Declaration only; the definition is not part of this header listing.
// Produces its result through the |encoded_filename| out-parameter (the
// function itself returns void).
// NOTE(review): the first comment line below looks truncated, and the
// |dir_separator| parameter it documents is not visible in the parameter list
// shown here. Encode() calls this function with four arguments
// (prefix, ending, '/', &filename) -- confirm against the full file.
146 // Rewrite HTML in a form that the SPDY in-memory server
148 // |filename_prefix| is prepended without escaping.
149 // |escaped_ending| is the URL to be encoded into a filename. It may have URL
150 // escaped characters (like %21 for !).
151 // |dir_separator| is "/" on Unix, "\" on Windows.
152 // |encoded_filename| is the resultant filename.
153 static void EncodeSegment(
154 const std::string
& filename_prefix
,
155 const std::string
& escaped_ending
,
157 std::string
* encoded_filename
);
// Declaration only; the definition is not part of this header listing.
// |encoded_filename| is the input; the decoded URL is written through
// |decoded_url|.
// Returns a bool -- presumably true when decoding succeeded; confirm against
// the definition.
// NOTE(review): a line appears to be missing between the two parameters shown
// here -- confirm the full parameter list against the full file.
159 // Decodes a filename that was encoded with EncodeSegment,
160 // yielding back the original URL.
161 static bool Decode(const std::string
& encoded_filename
,
163 std::string
* decoded_url
);
// Static constants used by the encoder. They are only declared here; their
// values are defined outside this listing.
// Character that introduces an escape sequence. NOTE(review): presumably ','
// given the ",XX" scheme described at the top of the file -- confirm.
165 static const char kEscapeChar
;
// Character used when an over-long segment is split. NOTE(review): presumably
// the '-' of the ",-/" separator described at the top of the file -- confirm.
166 static const char kTruncationChar
;
// Upper bound on the length of one directory name. NOTE(review): presumably
// 128 per the file header -- confirm in the definition.
167 static const size_t kMaximumSubdirectoryLength
;
// Grants the unit-test class access to the non-public members of this class.
169 friend class UrlToFilenameEncoderTest
;
// Declaration only; the definition is not part of this header listing.
// Both arguments are pointers to strings the function may modify: |segment|
// is in/out as described below, and |dest| is presumably the filename being
// built (per its name) -- confirm against the definition.
172 // Appends a segment of the path, special-casing "." and "..", and
173 // ensuring that the segment does not exceed the path length. If it does,
174 // it chops the end off the segment, writes the segment with a separator of
175 // ",-/", and then rewrites segment to contain just the truncated piece so
176 // it can be used in the next iteration.
177 // |segment| is a read/write parameter containing segment to write
178 // Note: this should not be called with empty segment.
179 static void AppendSegment(std::string
* segment
, std::string
* dest
);
// Declaration only; the definition is not part of this header listing.
// Takes |path| by const reference and returns the escaped form as a new
// string. Encode() applies it to the URL path after ConvertToSlashes() and
// StripDoubleSlashes() have run.
181 // Allow reading of old slurped files.
182 static std::string
LegacyEscape(const std::string
& path
);
// Replaces every non-overlapping occurrence of |from| within |*str| with
// |to|, scanning from left to right.
//
// |str|  - string edited in place; must not be null.
// |from| - substring to search for. When it is empty |*str| is left
//          untouched, because an empty pattern matches at every position and
//          the scan would never finish.
// |to|   - replacement text. It may itself contain |from|; text that was just
//          inserted is never scanned again.
static void ReplaceAll(std::string* str, const std::string& from,
                       const std::string& to) {
  if (from.empty())
    return;

  std::string::size_type pos = 0;
  while ((pos = str->find(from, pos)) != std::string::npos) {
    str->replace(pos, from.size(), to);
    // Resume immediately after the inserted text: this guarantees progress
    // when |to| contains |from| and never skips over untouched input.
    pos += to.size();
  }
}
194 // Replace all instances of "/" with "\" in |path|.
195 static void ConvertToSlashes(std::string
* path
) {
196 const std::string
slash("/");
197 const std::string
backslash("\\");
198 ReplaceAll(path
, slash
, backslash
);
201 // Replace all instances of "\\" with "%5C%5C" in |path|.
202 static void StripDoubleSlashes(std::string
* path
) {
203 const std::string
doubleslash("\\\\");
204 const std::string
escaped_doubleslash("%5C%5C");
205 ReplaceAll(path
, doubleslash
, escaped_doubleslash
);
211 #endif // NET_TOOLS_DUMP_CACHE_URL_TO_FILENAME_ENCODER_H_