1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "sync/internal_api/public/base/unique_position.h"
7 #include "base/basictypes.h"
8 #include "base/logging.h"
9 #include "base/rand_util.h"
10 #include "base/stl_util.h"
11 #include "base/strings/string_number_conversions.h"
12 #include "sync/protocol/unique_position.pb.h"
13 #include "third_party/zlib/zlib.h"
17 const size_t UniquePosition::kSuffixLength
= 28;
18 const size_t UniquePosition::kCompressBytesThreshold
= 128;
21 bool UniquePosition::IsValidSuffix(const std::string
& suffix
) {
22 // The suffix must be exactly the specified length, otherwise unique suffixes
23 // are not sufficient to guarantee unique positions (because prefix + suffix
24 // == p + refixsuffix).
25 return suffix
.length() == kSuffixLength
26 && suffix
[kSuffixLength
-1] != 0;
30 bool UniquePosition::IsValidBytes(const std::string
& bytes
) {
31 // The first condition ensures that our suffix uniqueness is sufficient to
32 // guarantee position uniqueness. Otherwise, it's possible the end of some
33 // prefix + some short suffix == some long suffix.
34 // The second condition ensures that FindSmallerWithSuffix can always return a
36 return bytes
.length() >= kSuffixLength
37 && bytes
[bytes
.length()-1] != 0;
41 std::string
UniquePosition::RandomSuffix() {
42 // Users random data for all but the last byte. The last byte must not be
43 // zero. We arbitrarily set it to 0x7f.
44 return base::RandBytesAsString(kSuffixLength
- 1) + "\x7f";
48 UniquePosition
UniquePosition::CreateInvalid() {
50 DCHECK(!pos
.IsValid());
55 UniquePosition
UniquePosition::FromProto(const sync_pb::UniquePosition
& proto
) {
56 if (proto
.has_custom_compressed_v1()) {
57 return UniquePosition(proto
.custom_compressed_v1());
58 } else if (proto
.has_value() && !proto
.value().empty()) {
59 return UniquePosition(Compress(proto
.value()));
60 } else if (proto
.has_compressed_value() && proto
.has_uncompressed_length()) {
61 uLongf uncompressed_len
= proto
.uncompressed_length();
62 std::string un_gzipped
;
64 un_gzipped
.resize(uncompressed_len
);
65 int result
= uncompress(
66 reinterpret_cast<Bytef
*>(string_as_array(&un_gzipped
)),
68 reinterpret_cast<const Bytef
*>(proto
.compressed_value().data()),
69 proto
.compressed_value().size());
71 DLOG(ERROR
) << "Unzip failed " << result
;
72 return UniquePosition::CreateInvalid();
74 if (uncompressed_len
!= proto
.uncompressed_length()) {
76 << "Uncompressed length " << uncompressed_len
77 << " did not match specified length " << proto
.uncompressed_length();
78 return UniquePosition::CreateInvalid();
80 return UniquePosition(Compress(un_gzipped
));
82 return UniquePosition::CreateInvalid();
87 UniquePosition
UniquePosition::FromInt64(
88 int64 x
, const std::string
& suffix
) {
89 uint64 y
= static_cast<uint64
>(x
);
90 y
^= 0x8000000000000000ULL
; // Make it non-negative.
91 std::string
bytes(8, 0);
92 for (int i
= 7; i
>= 0; --i
) {
93 bytes
[i
] = static_cast<uint8
>(y
);
96 return UniquePosition(bytes
+ suffix
, suffix
);
100 UniquePosition
UniquePosition::InitialPosition(
101 const std::string
& suffix
) {
102 DCHECK(IsValidSuffix(suffix
));
103 return UniquePosition(suffix
, suffix
);
107 UniquePosition
UniquePosition::Before(
108 const UniquePosition
& x
,
109 const std::string
& suffix
) {
110 DCHECK(IsValidSuffix(suffix
));
112 const std::string
& before
= FindSmallerWithSuffix(
113 Uncompress(x
.compressed_
), suffix
);
114 return UniquePosition(before
+ suffix
, suffix
);
118 UniquePosition
UniquePosition::After(
119 const UniquePosition
& x
,
120 const std::string
& suffix
) {
121 DCHECK(IsValidSuffix(suffix
));
123 const std::string
& after
= FindGreaterWithSuffix(
124 Uncompress(x
.compressed_
), suffix
);
125 return UniquePosition(after
+ suffix
, suffix
);
129 UniquePosition
UniquePosition::Between(
130 const UniquePosition
& before
,
131 const UniquePosition
& after
,
132 const std::string
& suffix
) {
133 DCHECK(before
.IsValid());
134 DCHECK(after
.IsValid());
135 DCHECK(before
.LessThan(after
));
136 DCHECK(IsValidSuffix(suffix
));
137 const std::string
& mid
= FindBetweenWithSuffix(
138 Uncompress(before
.compressed_
),
139 Uncompress(after
.compressed_
),
141 return UniquePosition(mid
+ suffix
, suffix
);
144 UniquePosition::UniquePosition() : is_valid_(false) {}
146 bool UniquePosition::LessThan(const UniquePosition
& other
) const {
147 DCHECK(this->IsValid());
148 DCHECK(other
.IsValid());
150 return compressed_
< other
.compressed_
;
153 bool UniquePosition::Equals(const UniquePosition
& other
) const {
154 if (!this->IsValid() && !other
.IsValid())
157 return compressed_
== other
.compressed_
;
160 void UniquePosition::ToProto(sync_pb::UniquePosition
* proto
) const {
163 // This is the current preferred foramt.
164 proto
->set_custom_compressed_v1(compressed_
);
166 // Older clients used to write other formats. We don't bother doing that
167 // anymore because that form of backwards compatibility is expensive. We no
168 // longer want to pay that price just too support clients that have been
169 // obsolete for a long time. See the proto definition for details.
172 void UniquePosition::SerializeToString(std::string
* blob
) const {
174 sync_pb::UniquePosition proto
;
176 proto
.SerializeToString(blob
);
179 int64
UniquePosition::ToInt64() const {
181 const std::string
& s
= Uncompress(compressed_
);
182 size_t l
= sizeof(int64
);
183 if (s
.length() < l
) {
187 for (size_t i
= 0; i
< l
; ++i
) {
188 const uint8 byte
= s
[l
- i
- 1];
189 y
|= static_cast<uint64
>(byte
) << (i
* 8);
191 y
^= 0x8000000000000000ULL
;
192 // This is technically implementation-defined if y > INT64_MAX, so
193 // we're assuming that we're on a twos-complement machine.
194 return static_cast<int64
>(y
);
197 bool UniquePosition::IsValid() const {
201 std::string
UniquePosition::ToDebugString() const {
202 const std::string bytes
= Uncompress(compressed_
);
204 return std::string("INVALID[]");
206 std::string debug_string
= base::HexEncode(bytes
.data(), bytes
.length());
208 debug_string
= "INVALID[" + debug_string
+ "]";
211 std::string compressed_string
=
212 base::HexEncode(compressed_
.data(), compressed_
.length());
213 debug_string
.append(", compressed: " + compressed_string
);
217 std::string
UniquePosition::GetSuffixForTest() const {
218 const std::string bytes
= Uncompress(compressed_
);
219 const size_t prefix_len
= bytes
.length() - kSuffixLength
;
220 return bytes
.substr(prefix_len
, std::string::npos
);
223 std::string
UniquePosition::FindSmallerWithSuffix(
224 const std::string
& reference
,
225 const std::string
& suffix
) {
226 size_t ref_zeroes
= reference
.find_first_not_of('\0');
227 size_t suffix_zeroes
= suffix
.find_first_not_of('\0');
229 // Neither of our inputs are allowed to have trailing zeroes, so the following
231 DCHECK_NE(ref_zeroes
, std::string::npos
);
232 DCHECK_NE(suffix_zeroes
, std::string::npos
);
234 if (suffix_zeroes
> ref_zeroes
) {
235 // Implies suffix < ref.
236 return std::string();
239 if (suffix
.substr(suffix_zeroes
) < reference
.substr(ref_zeroes
)) {
240 // Prepend zeroes so the result has as many zero digits as |reference|.
241 return std::string(ref_zeroes
- suffix_zeroes
, '\0');
242 } else if (suffix_zeroes
> 1) {
243 // Prepend zeroes so the result has one more zero digit than |reference|.
244 // We could also take the "else" branch below, but taking this branch will
245 // give us a smaller result.
246 return std::string(ref_zeroes
- suffix_zeroes
+ 1, '\0');
248 // Prepend zeroes to match those in the |reference|, then something smaller
249 // than the first non-zero digit in |reference|.
250 char lt_digit
= static_cast<uint8
>(reference
[ref_zeroes
])/2;
251 return std::string(ref_zeroes
, '\0') + lt_digit
;
256 std::string
UniquePosition::FindGreaterWithSuffix(
257 const std::string
& reference
,
258 const std::string
& suffix
) {
259 size_t ref_FFs
= reference
.find_first_not_of(kuint8max
);
260 size_t suffix_FFs
= suffix
.find_first_not_of(kuint8max
);
262 if (ref_FFs
== std::string::npos
) {
263 ref_FFs
= reference
.length();
265 if (suffix_FFs
== std::string::npos
) {
266 suffix_FFs
= suffix
.length();
269 if (suffix_FFs
> ref_FFs
) {
270 // Implies suffix > reference.
271 return std::string();
274 if (suffix
.substr(suffix_FFs
) > reference
.substr(ref_FFs
)) {
275 // Prepend FF digits to match those in |reference|.
276 return std::string(ref_FFs
- suffix_FFs
, kuint8max
);
277 } else if (suffix_FFs
> 1) {
278 // Prepend enough leading FF digits so result has one more of them than
279 // |reference| does. We could also take the "else" branch below, but this
280 // gives us a smaller result.
281 return std::string(ref_FFs
- suffix_FFs
+ 1, kuint8max
);
283 // Prepend FF digits to match those in |reference|, then something larger
284 // than the first non-FF digit in |reference|.
285 char gt_digit
= static_cast<uint8
>(reference
[ref_FFs
]) +
286 (kuint8max
- static_cast<uint8
>(reference
[ref_FFs
]) + 1) / 2;
287 return std::string(ref_FFs
, kuint8max
) + gt_digit
;
292 std::string
UniquePosition::FindBetweenWithSuffix(
293 const std::string
& before
,
294 const std::string
& after
,
295 const std::string
& suffix
) {
296 DCHECK(IsValidSuffix(suffix
));
297 DCHECK_NE(before
, after
);
298 DCHECK_LT(before
, after
);
302 // Sometimes our suffix puts us where we want to be.
303 if (before
< suffix
&& suffix
< after
) {
304 return std::string();
308 for ( ; i
< std::min(before
.length(), after
.length()); ++i
) {
309 uint8 a_digit
= before
[i
];
310 uint8 b_digit
= after
[i
];
312 if (b_digit
- a_digit
>= 2) {
313 mid
.push_back(a_digit
+ (b_digit
- a_digit
)/2);
315 } else if (a_digit
== b_digit
) {
316 mid
.push_back(a_digit
);
318 // Both strings are equal so far. Will appending the suffix at this point
319 // give us the comparison we're looking for?
320 if (before
.substr(i
+1) < suffix
&& suffix
< after
.substr(i
+1)) {
324 DCHECK_EQ(b_digit
- a_digit
, 1); // Implied by above if branches.
326 // The two options are off by one digit. The choice of whether to round
327 // up or down here will have consequences on what we do with the remaining
328 // digits. Exploring both options is an optimization and is not required
329 // for the correctness of this algorithm.
331 // Option A: Round down the current digit. This makes our |mid| <
332 // |after|, no matter what we append afterwards. We then focus on
333 // appending digits until |mid| > |before|.
334 std::string mid_a
= mid
;
335 mid_a
.push_back(a_digit
);
336 mid_a
.append(FindGreaterWithSuffix(before
.substr(i
+1), suffix
));
338 // Option B: Round up the current digit. This makes our |mid| > |before|,
339 // no matter what we append afterwards. We then focus on appending digits
340 // until |mid| < |after|. Note that this option may not be viable if the
341 // current digit is the last one in |after|, so we skip the option in that
343 if (after
.length() > i
+1) {
344 std::string mid_b
= mid
;
345 mid_b
.push_back(b_digit
);
346 mid_b
.append(FindSmallerWithSuffix(after
.substr(i
+1), suffix
));
348 // Does this give us a shorter position value? If so, use it.
349 if (mid_b
.length() < mid_a
.length()) {
357 // If we haven't found a midpoint yet, the following must be true:
358 DCHECK_EQ(before
.substr(0, i
), after
.substr(0, i
));
359 DCHECK_EQ(before
, mid
);
360 DCHECK_LT(before
.length(), after
.length());
362 // We know that we'll need to append at least one more byte to |mid| in the
363 // process of making it < |after|. Appending any digit, regardless of the
364 // value, will make |before| < |mid|. Therefore, the following will get us a
367 mid
.append(FindSmallerWithSuffix(after
.substr(i
), suffix
));
371 UniquePosition::UniquePosition(const std::string
& internal_rep
)
372 : compressed_(internal_rep
),
373 is_valid_(IsValidBytes(Uncompress(internal_rep
))) {
376 UniquePosition::UniquePosition(
377 const std::string
& uncompressed
,
378 const std::string
& suffix
)
379 : compressed_(Compress(uncompressed
)),
380 is_valid_(IsValidBytes(uncompressed
)) {
381 DCHECK(uncompressed
.rfind(suffix
) + kSuffixLength
== uncompressed
.length());
382 DCHECK(IsValidSuffix(suffix
));
386 // On custom compression:
388 // Let C(x) be the compression function and U(x) be the uncompression function.
390 // This compression scheme has a few special properties. For one, it is
391 // order-preserving. For any two valid position strings x and y:
392 // x < y <=> C(x) < C(y)
393 // This allows us keep the position strings compressed as we sort them.
395 // The compressed format and the decode algorithm:
397 // The compressed string is a series of blocks, almost all of which are 8 bytes
398 // in length. The only exception is the last block in the compressed string,
399 // which may be a remainder block, which has length no greater than 7. The
400 // full-length blocks are either repeated character blocks or plain data blocks.
401 // All blocks are entirely self-contained. Their decoded values are independent
402 // from that of their neighbours.
404 // A repeated character block is encoded into eight bytes and represents between
405 // 4 and 2^31 repeated instances of a given character in the unencoded stream.
406 // The encoding consists of a single character repeated four times, followed by
407 // an encoded count. The encoded count is stored as a big-endian 32 bit
408 // integer. There are 2^31 possible count values, and two encodings for each.
409 // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
410 // count'. At compression time, the algorithm will choose between the two
411 // encodings based on which of the two will maintain the appropriate sort
412 // ordering (by a process which will be described below). The decompression
413 // algorithm need not concern itself with which encoding was used; it needs only
414 // to decode it. The decoded value of this block is "count" instances of the
415 // character that was repeated four times in the first half of this block.
417 // A plain data block is encoded into eight bytes and represents exactly eight
418 // bytes of data in the unencoded stream. The plain data block must not begin
419 // with the same character repeated four times. It is allowed to contain such a
420 // four-character sequence, just not at the start of the block. The decoded
421 // value of a plain data block is identical to its encoded value.
423 // A remainder block has length of at most seven. It is a shorter version of
424 // the plain data block. It occurs only at the end of the encoded stream and
425 // represents exactly as many bytes of unencoded data as its own length. Like a
426 // plain data block, the remainder block never begins with the same character
427 // repeated four times. The decoded value of this block is identical to its
430 // The encode algorithm:
432 // From the above description, it can be seen that there may be more than one
433 // way to encode a given input string. The encoder must be careful to choose
434 // the encoding that guarantees sort ordering.
436 // The rules for the encoder are as follows:
437 // 1. Iterate through the input string and produce output blocks one at a time.
438 // 2. Where possible (ie. where the next four bytes of input consist of the
439 // same character repeated four times), produce a repeated data block of
440 // maximum possible length.
441 // 3. If there is at least 8 bytes of data remaining and it is not possible
442 // to produce a repeated character block, produce a plain data block.
443 // 4. If there are less than 8 bytes of data remaining and it is not possible
444 // to produce a repeated character block, produce a remainder block.
445 // 5. When producing a repeated character block, the count encoding must be
446 // chosen in such a way that the sort ordering is maintained. The choice is
447 // best illustrated by way of example:
449 // When comparing two strings, the first of which begins with of 8
450 // instances of the letter 'B' and the second with 10 instances of the
451 // letter 'B', which of the two should compare lower? The result depends
452 // on the 9th character of the first string, since it will be compared
453 // against the 9th 'B' in the second string. If that character is an 'A',
454 // then the first string will compare lower. If it is a 'C', then the
455 // first string will compare higher.
457 // The key insight is that the comparison value of a repeated character block
458 // depends on the value of the character that follows it. If the character
459 // follows the repeated character has a value greater than the repeated
460 // character itself, then a shorter run length should translate to a higher
461 // comparison value. Therefore, we encode its count using the low encoding.
462 // Similarly, if the following character is lower, we use the high encoding.
466 // Appends an encoded run length to |output_str|.
467 static void WriteEncodedRunLength(uint32 length
,
469 std::string
* output_str
) {
470 CHECK_GE(length
, 4U);
471 CHECK_LT(length
, 0x80000000);
473 // Step 1: Invert the count, if necessary, to account for the following digit.
474 uint32 encoded_length
;
476 encoded_length
= 0xffffffff - length
;
478 encoded_length
= length
;
481 // Step 2: Write it as big-endian so it compares correctly with memcmp(3).
482 output_str
->append(1, 0xff & (encoded_length
>> 24U));
483 output_str
->append(1, 0xff & (encoded_length
>> 16U));
484 output_str
->append(1, 0xff & (encoded_length
>> 8U));
485 output_str
->append(1, 0xff & (encoded_length
>> 0U));
488 // Reads an encoded run length for |str| at position |i|.
489 static uint32
ReadEncodedRunLength(const std::string
& str
, size_t i
) {
490 DCHECK_LE(i
+ 4, str
.length());
492 // Step 1: Extract the big-endian count.
493 uint32 encoded_length
=
494 ((uint8
)(str
[i
+3]) << 0) |
495 ((uint8
)(str
[i
+2]) << 8) |
496 ((uint8
)(str
[i
+1]) << 16) |
497 ((uint8
)(str
[i
+0]) << 24);
499 // Step 2: If this was an inverted count, un-invert it.
501 if (encoded_length
& 0x80000000) {
502 length
= 0xffffffff - encoded_length
;
504 length
= encoded_length
;
510 // A series of four identical chars at the beginning of a block indicates
511 // the beginning of a repeated character block.
512 static bool IsRepeatedCharPrefix(const std::string
& chars
, size_t start_index
) {
513 return chars
[start_index
] == chars
[start_index
+1]
514 && chars
[start_index
] == chars
[start_index
+2]
515 && chars
[start_index
] == chars
[start_index
+3];
521 // Wraps the CompressImpl function with a bunch of DCHECKs.
522 std::string
UniquePosition::Compress(const std::string
& str
) {
523 DCHECK(IsValidBytes(str
));
524 std::string compressed
= CompressImpl(str
);
525 DCHECK(IsValidCompressed(compressed
));
526 DCHECK_EQ(str
, Uncompress(compressed
));
531 // Performs the order preserving run length compression of a given input string.
532 std::string
UniquePosition::CompressImpl(const std::string
& str
) {
535 // The compressed length will usually be at least as long as the suffix (28),
536 // since the suffix bytes are mostly random. Most are a few bytes longer; a
537 // small few are tens of bytes longer. Some early tests indicated that
538 // roughly 99% had length 40 or smaller. We guess that pre-sizing for 48 is a
539 // good trade-off, but that has not been confirmed through profiling.
542 // Each loop iteration will consume 8, or N bytes, where N >= 4 and is the
543 // length of a string of identical digits starting at i.
544 for (size_t i
= 0; i
< str
.length(); ) {
545 if (i
+ 4 <= str
.length() && IsRepeatedCharPrefix(str
, i
)) {
546 // Four identical bytes in a row at this position means that we must start
547 // a repeated character block. Begin by outputting those four bytes.
548 output
.append(str
, i
, 4);
550 // Determine the size of the run.
551 const char rep_digit
= str
[i
];
552 const size_t runs_until
= str
.find_first_not_of(rep_digit
, i
+4);
554 // Handle the 'runs until end' special case specially.
556 bool encode_high
; // True if the next byte is greater than |rep_digit|.
557 if (runs_until
== std::string::npos
) {
558 run_length
= str
.length() - i
;
561 run_length
= runs_until
- i
;
562 encode_high
= static_cast<uint8
>(str
[runs_until
]) >
563 static_cast<uint8
>(rep_digit
);
565 DCHECK_LT(run_length
, static_cast<size_t>(kint32max
))
566 << "This implementation can't encode run-lengths greater than 2^31.";
568 WriteEncodedRunLength(run_length
, encode_high
, &output
);
569 i
+= run_length
; // Jump forward by the size of the run length.
571 // Output up to eight bytes without any encoding.
572 const size_t len
= std::min(static_cast<size_t>(8), str
.length() - i
);
573 output
.append(str
, i
, len
);
574 i
+= len
; // Jump forward by the amount of input consumed (usually 8).
582 // Uncompresses strings that were compresed with UniquePosition::Compress.
583 std::string
UniquePosition::Uncompress(const std::string
& str
) {
586 // Iterate through the compressed string one block at a time.
587 for (i
= 0; i
+ 8 <= str
.length(); i
+= 8) {
588 if (IsRepeatedCharPrefix(str
, i
)) {
589 // Found a repeated character block. Expand it.
590 const char rep_digit
= str
[i
];
591 uint32 length
= ReadEncodedRunLength(str
, i
+4);
592 output
.append(length
, rep_digit
);
594 // Found a regular block. Copy it.
595 output
.append(str
, i
, 8);
598 // Copy the remaining bytes that were too small to form a block.
599 output
.append(str
, i
, std::string::npos
);
603 bool UniquePosition::IsValidCompressed(const std::string
& str
) {
604 for (size_t i
= 0; i
+ 8 <= str
.length(); i
+= 8) {
605 if (IsRepeatedCharPrefix(str
, i
)) {
606 uint32 count
= ReadEncodedRunLength(str
, i
+4);
608 // A repeated character block should at least represent the four
609 // characters that started it.
612 if (str
[i
] == str
[i
+4]) {
613 // Does the next digit after a count match the repeated character? Then
614 // this is not the highest possible count.
619 // We don't bother looking for the existence or checking the validity of
620 // any partial blocks. There's no way they could be invalid anyway.
624 } // namespace syncer