1 //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file provides a hash table data structure suitable for incremental and
10 // distributed storage across a set of files.
12 // Multiple hash tables from different files are implicitly merged to improve
13 // performance, and on reload the merged table will override those from other
16 //===----------------------------------------------------------------------===//
18 #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
19 #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
21 #include "llvm/ADT/DenseMap.h"
22 #include "llvm/ADT/DenseSet.h"
23 #include "llvm/ADT/PointerUnion.h"
24 #include "llvm/ADT/STLExtras.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/TinyPtrVector.h"
27 #include "llvm/ADT/iterator_range.h"
28 #include "llvm/Support/Endian.h"
29 #include "llvm/Support/EndianStream.h"
30 #include "llvm/Support/OnDiskHashTable.h"
31 #include "llvm/Support/raw_ostream.h"
37 namespace serialization
{
39 /// A collection of on-disk hash tables, merged when relevant for performance.
40 template<typename Info
> class MultiOnDiskHashTable
{
42 /// A handle to a file, used when overriding tables.
43 using file_type
= typename
Info::file_type
;
45 /// A pointer to an on-disk representation of the hash table.
46 using storage_type
= const unsigned char *;
48 using external_key_type
= typename
Info::external_key_type
;
49 using internal_key_type
= typename
Info::internal_key_type
;
50 using data_type
= typename
Info::data_type
;
51 using data_type_builder
= typename
Info::data_type_builder
;
52 using hash_value_type
= unsigned;
55 /// The generator is permitted to read our merged table.
56 template<typename ReaderInfo
, typename WriterInfo
>
57 friend class MultiOnDiskHashTableGenerator
;
59 /// A hash table stored on disk.
61 using HashTable
= llvm::OnDiskIterableChainedHashTable
<Info
>;
66 OnDiskTable(file_type File
, unsigned NumBuckets
, unsigned NumEntries
,
67 storage_type Buckets
, storage_type Payload
, storage_type Base
,
70 Table(NumBuckets
, NumEntries
, Buckets
, Payload
, Base
, InfoObj
) {}
74 std::vector
<file_type
> Files
;
75 llvm::DenseMap
<internal_key_type
, data_type
> Data
;
78 using Table
= llvm::PointerUnion
<OnDiskTable
*, MergedTable
*>;
79 using TableVector
= llvm::TinyPtrVector
<void *>;
81 /// The current set of on-disk and merged tables.
82 /// We manually store the opaque value of the Table because TinyPtrVector
83 /// can't cope with holding a PointerUnion directly.
84 /// There can be at most one MergedTable in this vector, and if present,
85 /// it is the first table.
88 /// Files corresponding to overridden tables that we've not yet
90 llvm::TinyPtrVector
<file_type
> PendingOverrides
;
92 struct AsOnDiskTable
{
93 using result_type
= OnDiskTable
*;
95 result_type
operator()(void *P
) const {
96 return Table::getFromOpaqueValue(P
).template get
<OnDiskTable
*>();
100 using table_iterator
=
101 llvm::mapped_iterator
<TableVector::iterator
, AsOnDiskTable
>;
102 using table_range
= llvm::iterator_range
<table_iterator
>;
104 /// The current set of on-disk tables.
105 table_range
tables() {
106 auto Begin
= Tables
.begin(), End
= Tables
.end();
107 if (getMergedTable())
109 return llvm::make_range(llvm::map_iterator(Begin
, AsOnDiskTable()),
110 llvm::map_iterator(End
, AsOnDiskTable()));
113 MergedTable
*getMergedTable() const {
114 // If we already have a merged table, it's the first one.
115 return Tables
.empty() ? nullptr : Table::getFromOpaqueValue(*Tables
.begin())
116 .template dyn_cast
<MergedTable
*>();
119 /// Delete all our current on-disk tables.
121 for (auto *T
: tables())
123 if (auto *M
= getMergedTable())
128 void removeOverriddenTables() {
129 llvm::DenseSet
<file_type
> Files
;
130 Files
.insert(PendingOverrides
.begin(), PendingOverrides
.end());
131 // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
132 auto ShouldRemove
= [&Files
](void *T
) -> bool {
133 auto *ODT
= Table::getFromOpaqueValue(T
).template get
<OnDiskTable
*>();
134 bool Remove
= Files
.count(ODT
->File
);
139 Tables
.erase(std::remove_if(tables().begin().getCurrent(), Tables
.end(),
142 PendingOverrides
.clear();
146 MergedTable
*Merged
= getMergedTable();
148 Merged
= new MergedTable
;
150 // Read in all the tables and merge them together.
151 // FIXME: Be smarter about which tables we merge.
152 for (auto *ODT
: tables()) {
153 auto &HT
= ODT
->Table
;
154 Info
&InfoObj
= HT
.getInfoObj();
156 for (auto I
= HT
.data_begin(), E
= HT
.data_end(); I
!= E
; ++I
) {
157 auto *LocalPtr
= I
.getItem();
159 // FIXME: Don't rely on the OnDiskHashTable format here.
160 auto L
= InfoObj
.ReadKeyDataLength(LocalPtr
);
161 const internal_key_type
&Key
= InfoObj
.ReadKey(LocalPtr
, L
.first
);
162 data_type_builder
ValueBuilder(Merged
->Data
[Key
]);
163 InfoObj
.ReadDataInto(Key
, LocalPtr
+ L
.first
, L
.second
,
167 Merged
->Files
.push_back(ODT
->File
);
172 Tables
.push_back(Table(Merged
).getOpaqueValue());
176 MultiOnDiskHashTable() = default;
178 MultiOnDiskHashTable(MultiOnDiskHashTable
&&O
)
179 : Tables(std::move(O
.Tables
)),
180 PendingOverrides(std::move(O
.PendingOverrides
)) {
184 MultiOnDiskHashTable
&operator=(MultiOnDiskHashTable
&&O
) {
188 Tables
= std::move(O
.Tables
);
190 PendingOverrides
= std::move(O
.PendingOverrides
);
194 ~MultiOnDiskHashTable() { clear(); }
196 /// Add the table \p Data loaded from file \p File.
197 void add(file_type File
, storage_type Data
, Info InfoObj
= Info()) {
198 using namespace llvm::support
;
200 storage_type Ptr
= Data
;
202 uint32_t BucketOffset
= endian::readNext
<uint32_t, little
, unaligned
>(Ptr
);
204 // Read the list of overridden files.
205 uint32_t NumFiles
= endian::readNext
<uint32_t, little
, unaligned
>(Ptr
);
206 // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
207 // an additional copy.
208 llvm::SmallVector
<file_type
, 16> OverriddenFiles
;
209 OverriddenFiles
.reserve(NumFiles
);
210 for (/**/; NumFiles
!= 0; --NumFiles
)
211 OverriddenFiles
.push_back(InfoObj
.ReadFileRef(Ptr
));
212 PendingOverrides
.insert(PendingOverrides
.end(), OverriddenFiles
.begin(),
213 OverriddenFiles
.end());
215 // Read the OnDiskChainedHashTable header.
216 storage_type Buckets
= Data
+ BucketOffset
;
217 auto NumBucketsAndEntries
=
218 OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets
);
220 // Register the table.
221 Table NewTable
= new OnDiskTable(File
, NumBucketsAndEntries
.first
,
222 NumBucketsAndEntries
.second
,
223 Buckets
, Ptr
, Data
, std::move(InfoObj
));
224 Tables
.push_back(NewTable
.getOpaqueValue());
227 /// Find and read the lookup results for \p EKey.
228 data_type
find(const external_key_type
&EKey
) {
231 if (!PendingOverrides
.empty())
232 removeOverriddenTables();
234 if (Tables
.size() > static_cast<unsigned>(Info::MaxTables
))
237 internal_key_type Key
= Info::GetInternalKey(EKey
);
238 auto KeyHash
= Info::ComputeHash(Key
);
240 if (MergedTable
*M
= getMergedTable()) {
241 auto It
= M
->Data
.find(Key
);
242 if (It
!= M
->Data
.end())
246 data_type_builder
ResultBuilder(Result
);
248 for (auto *ODT
: tables()) {
249 auto &HT
= ODT
->Table
;
250 auto It
= HT
.find_hashed(Key
, KeyHash
);
252 HT
.getInfoObj().ReadDataInto(Key
, It
.getDataPtr(), It
.getDataLen(),
259 /// Read all the lookup results into a single value. This only makes
260 /// sense if merging values across keys is meaningful.
261 data_type
findAll() {
263 data_type_builder
ResultBuilder(Result
);
265 if (!PendingOverrides
.empty())
266 removeOverriddenTables();
268 if (MergedTable
*M
= getMergedTable()) {
269 for (auto &KV
: M
->Data
)
270 Info::MergeDataInto(KV
.second
, ResultBuilder
);
273 for (auto *ODT
: tables()) {
274 auto &HT
= ODT
->Table
;
275 Info
&InfoObj
= HT
.getInfoObj();
276 for (auto I
= HT
.data_begin(), E
= HT
.data_end(); I
!= E
; ++I
) {
277 auto *LocalPtr
= I
.getItem();
279 // FIXME: Don't rely on the OnDiskHashTable format here.
280 auto L
= InfoObj
.ReadKeyDataLength(LocalPtr
);
281 const internal_key_type
&Key
= InfoObj
.ReadKey(LocalPtr
, L
.first
);
282 InfoObj
.ReadDataInto(Key
, LocalPtr
+ L
.first
, L
.second
, ResultBuilder
);
290 /// Writer for the on-disk hash table.
291 template<typename ReaderInfo
, typename WriterInfo
>
292 class MultiOnDiskHashTableGenerator
{
293 using BaseTable
= MultiOnDiskHashTable
<ReaderInfo
>;
294 using Generator
= llvm::OnDiskChainedHashTableGenerator
<WriterInfo
>;
299 MultiOnDiskHashTableGenerator() : Gen() {}
301 void insert(typename
WriterInfo::key_type_ref Key
,
302 typename
WriterInfo::data_type_ref Data
, WriterInfo
&Info
) {
303 Gen
.insert(Key
, Data
, Info
);
306 void emit(llvm::SmallVectorImpl
<char> &Out
, WriterInfo
&Info
,
307 const BaseTable
*Base
) {
308 using namespace llvm::support
;
310 llvm::raw_svector_ostream
OutStream(Out
);
312 // Write our header information.
314 endian::Writer
Writer(OutStream
, little
);
316 // Reserve four bytes for the bucket offset.
317 Writer
.write
<uint32_t>(0);
319 if (auto *Merged
= Base
? Base
->getMergedTable() : nullptr) {
320 // Write list of overridden files.
321 Writer
.write
<uint32_t>(Merged
->Files
.size());
322 for (const auto &F
: Merged
->Files
)
323 Info
.EmitFileRef(OutStream
, F
);
325 // Add all merged entries from Base to the generator.
326 for (auto &KV
: Merged
->Data
) {
327 if (!Gen
.contains(KV
.first
, Info
))
328 Gen
.insert(KV
.first
, Info
.ImportData(KV
.second
), Info
);
331 Writer
.write
<uint32_t>(0);
335 // Write the table itself.
336 uint32_t BucketOffset
= Gen
.Emit(OutStream
, Info
);
338 // Replace the first four bytes with the bucket offset.
339 endian::write32le(Out
.data(), BucketOffset
);
343 } // namespace serialization
346 #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H