1 //===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Lempel–Ziv–Welch encoding/decoding
11 //===----------------------------------------------------------------------===//
13 #ifndef SANITIZER_LZW_H
14 #define SANITIZER_LZW_H
16 #include "sanitizer_dense_map.h"
18 namespace __sanitizer
{
20 using LzwCodeType
= u32
;
22 template <class T
, class ItIn
, class ItOut
>
23 ItOut
LzwEncode(ItIn begin
, ItIn end
, ItOut out
) {
25 detail::DenseMapPair
<LzwCodeType
/* Prefix */, T
/* Next input */>;
27 // Sentinel value for substrings of len 1.
28 static constexpr LzwCodeType kNoPrefix
=
29 Min(DenseMapInfo
<Substring
>::getEmptyKey().first
,
30 DenseMapInfo
<Substring
>::getTombstoneKey().first
) -
32 DenseMap
<Substring
, LzwCodeType
> prefix_to_code
;
34 // Add all substring of len 1 as initial dictionary.
35 InternalMmapVector
<T
> dict_len1
;
36 for (auto it
= begin
; it
!= end
; ++it
)
37 if (prefix_to_code
.try_emplace({kNoPrefix
, *it
}, 0).second
)
38 dict_len1
.push_back(*it
);
40 // Slightly helps with later delta encoding.
41 Sort(dict_len1
.data(), dict_len1
.size());
43 // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can
44 // just generate them.
45 *out
= dict_len1
.size();
48 for (uptr i
= 0; i
!= dict_len1
.size(); ++i
) {
49 // Remap after the Sort.
50 prefix_to_code
[{kNoPrefix
, dict_len1
[i
]}] = i
;
54 CHECK_EQ(prefix_to_code
.size(), dict_len1
.size());
60 // Main LZW encoding loop.
61 LzwCodeType match
= prefix_to_code
.find({kNoPrefix
, *begin
})->second
;
63 for (auto it
= begin
; it
!= end
; ++it
) {
64 // Extend match with the new item.
65 auto ins
= prefix_to_code
.try_emplace({match
, *it
}, prefix_to_code
.size());
67 // This is a new substring, but emit the code for the current match
68 // (before extend). This allows LZW decoder to recover the dictionary.
71 // Reset the match to a single item, which must be already in the map.
72 match
= prefix_to_code
.find({kNoPrefix
, *it
})->second
;
74 // Already known, use as the current match.
75 match
= ins
.first
->second
;
85 template <class T
, class ItIn
, class ItOut
>
86 ItOut
LzwDecode(ItIn begin
, ItIn end
, ItOut out
) {
90 // Load dictionary of len 1 substrings. Theses correspont to lowest codes.
91 InternalMmapVector
<T
> dict_len1(*begin
);
97 for (auto& v
: dict_len1
) {
102 // Substrings of len 2 and up. Indexes are shifted because [0,
103 // dict_len1.size()) stored in dict_len1. Substings get here after being
104 // emitted to the output, so we can use output position.
105 InternalMmapVector
<detail::DenseMapPair
<ItOut
/* begin. */, ItOut
/* end */>>
108 // Copies already emitted substrings into the output again.
109 auto copy
= [&code_to_substr
, &dict_len1
](LzwCodeType code
, ItOut out
) {
110 if (code
< dict_len1
.size()) {
111 *out
= dict_len1
[code
];
115 const auto& s
= code_to_substr
[code
- dict_len1
.size()];
117 for (ItOut it
= s
.first
; it
!= s
.second
; ++it
, ++out
) *out
= *it
;
121 // Returns lens of the substring with the given code.
122 auto code_to_len
= [&code_to_substr
, &dict_len1
](LzwCodeType code
) -> uptr
{
123 if (code
< dict_len1
.size())
125 const auto& s
= code_to_substr
[code
- dict_len1
.size()];
126 return s
.second
- s
.first
;
129 // Main LZW decoding loop.
130 LzwCodeType prev_code
= *begin
;
132 out
= copy(prev_code
, out
);
133 for (auto it
= begin
; it
!= end
; ++it
) {
134 LzwCodeType code
= *it
;
136 if (code
== dict_len1
.size() + code_to_substr
.size()) {
137 // Special LZW case. The code is not in the dictionary yet. This is
138 // possible only when the new substring is the same as previous one plus
139 // the first item of the previous substring. We can emit that in two
141 out
= copy(prev_code
, out
);
145 out
= copy(code
, out
);
148 // Every time encoded emits the code, it also creates substing of len + 1
149 // including the first item of the just emmited substring. Do the same here.
150 uptr len
= code_to_len(prev_code
);
151 code_to_substr
.push_back({start
- len
, start
+ 1});
158 } // namespace __sanitizer