Popular sites on the NTP: check that experiment group StartsWith (rather than IS...
[chromium-blink-merge.git] / chrome / browser / safe_browsing / safe_browsing_store_file.h
blobbbe22fa812368d9be0e48bf2eabd03d85c43edf4
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
6 #define CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_
8 #include <set>
9 #include <vector>
11 #include "chrome/browser/safe_browsing/safe_browsing_store.h"
13 #include "base/callback.h"
14 #include "base/files/file_path.h"
15 #include "base/files/scoped_file.h"
16 #include "base/sequenced_task_runner.h"
18 // Implement SafeBrowsingStore in terms of a flat file. The file
19 // format is pretty literal:
21 // int32 magic; // magic number "validating" file
22 // int32 version; // format version
24 // // Counts for the various data which follows the header.
25 // uint32 add_chunk_count; // Chunks seen, including empties.
26 // uint32 sub_chunk_count; // Ditto.
27 // uint32 shard_stride; // SBPrefix space covered per shard.
28 // // 0==entire space in one shard.
29 // // Sorted by chunk_id.
30 // array[add_chunk_count] {
31 // int32 chunk_id;
32 // }
33 // // Sorted by chunk_id.
34 // array[sub_chunk_count] {
35 // int32 chunk_id;
36 // }
37 // MD5Digest header_checksum; // Checksum over preceding data.
39 // // Sorted by prefix, then add chunk_id, then hash, both within shards and
40 // // overall.
41 // array[from 0 to wraparound to 0 by shard_stride] {
42 // uint32 add_prefix_count;
43 // uint32 sub_prefix_count;
44 // uint32 add_hash_count;
45 // uint32 sub_hash_count;
46 // array[add_prefix_count] {
47 // int32 chunk_id;
48 // uint32 prefix;
49 // }
50 // array[sub_prefix_count] {
51 // int32 chunk_id;
52 // int32 add_chunk_id;
53 // uint32 add_prefix;
54 // }
55 // array[add_hash_count] {
56 // int32 chunk_id;
57 // int32 received_time; // From base::Time::ToTimeT().
58 // char[32] full_hash;
59 // }
60 // array[sub_hash_count] {
61 // int32 chunk_id;
62 // int32 add_chunk_id;
63 // char[32] add_full_hash;
64 // }
65 // }
66 // MD5Digest checksum; // Checksum over entire file.
68 // The checksums are used to allow writing the file without doing an expensive
69 // fsync(). Since the data can be re-fetched, failing the checksum is not
70 // catastrophic. Histograms indicate that file corruption here is pretty
71 // uncommon.
73 // The |header_checksum| is present to guarantee valid header and chunk data for
74 // updates. Only that part of the file needs to be read to post the update.
76 // |shard_stride| breaks the file into approximately-equal portions, allowing
77 // updates to stream from one file to another with modest memory usage. It is
78 // dynamic to adjust to different file sizes without adding excessive overhead.
80 // During the course of an update, uncommitted data is stored in a
81 // temporary file (which is later re-used to commit). This is an
82 // array of chunks, with the count kept in memory until the end of the
83 // transaction. The format of this file is like the main file, with
84 // the list of chunks seen omitted, as that data is tracked in-memory:
86 // array[] {
87 // uint32 add_prefix_count;
88 // uint32 sub_prefix_count;
89 // uint32 add_hash_count;
90 // uint32 sub_hash_count;
91 // array[add_prefix_count] {
92 // int32 chunk_id;
93 // uint32 prefix;
94 // }
95 // array[sub_prefix_count] {
96 // int32 chunk_id;
97 // int32 add_chunk_id;
98 // uint32 add_prefix;
99 // }
100 // array[add_hash_count] {
101 // int32 chunk_id;
102 // int32 received_time; // From base::Time::ToTimeT().
103 // char[32] full_hash;
104 // }
105 // array[sub_hash_count] {
106 // int32 chunk_id;
107 // int32 add_chunk_id;
108 // char[32] add_full_hash;
109 // }
110 // }
112 // The overall transaction works like this:
113 // - Open the original file to get the chunks-seen data.
114 // - Open a temp file for storing new chunk info.
115 // - Write new chunks to the temp file.
116 // - When the transaction is finished:
117 // - Read the update data from the temp file into memory.
118 // - Overwrite the temp file with new header data.
119 // - Until done:
120 // - Read shards of the original file's data into memory.
121 // - Merge from the update data.
122 // - Write shards to the temp file.
123 // - Delete original file.
124 // - Rename temp file to original filename.
// File-backed implementation of the SafeBrowsingStore interface (see the
// base-class list below). Chunk data is buffered in memory between
// BeginChunk() and FinishChunk(), and the sets of seen/deleted chunk ids are
// cached for the duration of an update, per the member comments further down.
// NOTE(review): in this copy of the header the closing braces of the three
// inline bodies and of the class itself are absent (the embedded numbering
// skips 180, 224, 234 and 280) -- restore them before compiling.
126 class SafeBrowsingStoreFile : public SafeBrowsingStore {
127 public:
// |task_runner| is presumably retained in |task_runner_| for the
// CalledOnValidThread() check -- confirm in the .cc file.
128 explicit SafeBrowsingStoreFile(
129 const scoped_refptr<const base::SequencedTaskRunner>& task_runner);
130 ~SafeBrowsingStoreFile() override;
// Overrides SafeBrowsingStore::Init(). Presumably records |filename| and
// |corruption_callback| in |filename_| and |corruption_callback_| -- confirm
// in the .cc file.
132 void Init(const base::FilePath& filename,
133 const base::Closure& corruption_callback) override;
135 // Delete any on-disk files, including the permanent storage.
136 bool Delete() override;
138 // Get all add hash prefixes and full-length hashes, respectively, from
139 // the store.
140 bool GetAddPrefixes(SBAddPrefixes* add_prefixes) override;
141 bool GetAddFullHashes(std::vector<SBAddFullHash>* add_full_hashes) override;
// Chunk-level writes. These presumably append to the in-memory buffers
// (|add_prefixes_|, |sub_prefixes_|, |add_hashes_|, |sub_hashes_|), which are
// documented below as collecting data between BeginChunk() and FinishChunk()
// -- confirm the flush behavior in the .cc file.
143 bool BeginChunk() override;
145 bool WriteAddPrefix(int32 chunk_id, SBPrefix prefix) override;
146 bool WriteAddHash(int32 chunk_id, const SBFullHash& full_hash) override;
147 bool WriteSubPrefix(int32 chunk_id,
148 int32 add_chunk_id,
149 SBPrefix prefix) override;
150 bool WriteSubHash(int32 chunk_id,
151 int32 add_chunk_id,
152 const SBFullHash& full_hash) override;
153 bool FinishChunk() override;
// Update (transaction) lifecycle. The private DoUpdate() is documented as
// doing the actual work for FinishUpdate(), so that FinishUpdate() can clean
// up correctly in case of error.
155 bool BeginUpdate() override;
156 bool FinishUpdate(
157 safe_browsing::PrefixSetBuilder* builder,
158 std::vector<SBAddFullHash>* add_full_hashes_result) override;
159 bool CancelUpdate() override;
// Chunks-seen bookkeeping. Presumably backed by |add_chunks_cache_| and
// |sub_chunks_cache_|, which are loaded on BeginUpdate() -- confirm in the
// .cc file.
161 void SetAddChunk(int32 chunk_id) override;
162 bool CheckAddChunk(int32 chunk_id) override;
163 void GetAddChunks(std::vector<int32>* out) override;
164 void SetSubChunk(int32 chunk_id) override;
165 bool CheckSubChunk(int32 chunk_id) override;
166 void GetSubChunks(std::vector<int32>* out) override;
// Presumably record |chunk_id| in |add_del_cache_| / |sub_del_cache_|, whose
// contents are documented as being applied on FinishUpdate() -- confirm in
// the .cc file.
168 void DeleteAddChunk(int32 chunk_id) override;
169 void DeleteSubChunk(int32 chunk_id) override;
171 // Verify |file_|'s checksum, calling the corruption callback if it
172 // does not check out. Empty input is considered valid.
173 bool CheckValidity() override;
175 // Returns the name of the temporary file used to buffer data for
176 // |filename|. Exported for unit tests.
// The temporary name is |filename|'s value with the literal suffix "_new"
// appended.
177 static const base::FilePath TemporaryFileForFilename(
178 const base::FilePath& filename) {
179 return base::FilePath(filename.value() + FILE_PATH_LITERAL("_new"));
182 // Delete any on-disk files, including the permanent storage.
183 static bool DeleteStore(const base::FilePath& basename);
185 private:
186 // Checks whether the current thread is part of the sequenced task runner
187 // this object was initialized with.
188 bool CalledOnValidThread();
190 // Does the actual update for FinishUpdate(), so that FinishUpdate() can clean
191 // up correctly in case of error.
// Takes the same two parameters as FinishUpdate().
192 virtual bool DoUpdate(safe_browsing::PrefixSetBuilder* builder,
193 std::vector<SBAddFullHash>* add_full_hashes_result);
195 // Some very lucky users have an original-format file still in their
196 // profile. Check for it and delete, recording a histogram for the
197 // result (no histogram for not-found). Logically this
198 // would make more sense at the SafeBrowsingDatabase level, but
199 // practically speaking that code doesn't touch files directly.
200 static void CheckForOriginalAndDelete(const base::FilePath& filename);
202 // Close all files and clear all buffers.
203 bool Close();
205 // Calls |corruption_callback_| if non-NULL, always returns false as
206 // a convenience to the caller.
207 bool OnCorruptDatabase();
209 // Helper for creating a corruption callback for |old_store_|.
210 // TODO(shess): Remove after migration.
// NOTE(review): no |old_store_| member is declared in this class as shown;
// the comment above may be stale -- confirm.
211 void HandleCorruptDatabase();
213 // Clear temporary buffers used to accumulate chunk data.
// Each buffer is swapped with a freshly constructed empty container rather
// than clear()ed (see the NOTE below); the function then returns true
// unconditionally.
214 bool ClearChunkBuffers() {
215 // NOTE: .clear() doesn't release memory.
216 // TODO(shess): Figure out if this is overkill. Some amount of
217 // pre-reserved space is probably reasonable between each chunk
218 // collected.
219 SBAddPrefixes().swap(add_prefixes_);
220 SBSubPrefixes().swap(sub_prefixes_);
221 std::vector<SBAddFullHash>().swap(add_hashes_);
222 std::vector<SBSubFullHash>().swap(sub_hashes_);
223 return true;
226 // Clear all buffers used during update.
// Clears the chunk buffers, resets |chunks_written_| to 0, and swap-empties
// the chunks-seen and chunks-deleted caches.
227 void ClearUpdateBuffers() {
228 ClearChunkBuffers();
229 chunks_written_ = 0;
230 std::set<int32>().swap(add_chunks_cache_);
231 std::set<int32>().swap(sub_chunks_cache_);
232 base::hash_set<int32>().swap(add_del_cache_);
233 base::hash_set<int32>().swap(sub_del_cache_);
236 // The sequenced task runner for this object, used to verify that its state
237 // is only ever accessed from the runner.
238 scoped_refptr<const base::SequencedTaskRunner> task_runner_;
240 // Buffers for collecting data between BeginChunk() and
241 // FinishChunk().
242 SBAddPrefixes add_prefixes_;
243 SBSubPrefixes sub_prefixes_;
244 std::vector<SBAddFullHash> add_hashes_;
245 std::vector<SBSubFullHash> sub_hashes_;
247 // Count of chunks collected in |new_file_|.
248 int chunks_written_;
250 // Name of the main database file.
251 base::FilePath filename_;
253 // Handles to the main and scratch files. |empty_| is true if the
254 // main file didn't exist when the update was started.
255 base::ScopedFILE file_;
256 base::ScopedFILE new_file_;
257 bool empty_;
259 // Cache of chunks which have been seen. Loaded from the database
260 // on BeginUpdate() so that it can be queried during the
261 // transaction.
262 std::set<int32> add_chunks_cache_;
263 std::set<int32> sub_chunks_cache_;
265 // Cache the set of deleted chunks during a transaction, applied on
266 // FinishUpdate().
267 // TODO(shess): If the set is small enough, hash_set<> might be
268 // slower than plain set<>.
// NOTE(review): none of the includes visible in this header names a hash_set
// header directly; base::hash_set presumably arrives transitively -- confirm.
269 base::hash_set<int32> add_del_cache_;
270 base::hash_set<int32> sub_del_cache_;
// Presumably the callback passed to Init(); OnCorruptDatabase() is documented
// above as calling it when non-NULL.
272 base::Closure corruption_callback_;
274 // Tracks whether corruption has already been seen in the current
275 // update, so that only one instance is recorded in the stats.
276 // TODO(shess): Remove with format-migration support.
277 bool corruption_seen_;
// Presumably disables copy construction and copy assignment for this class
// (Chromium macro) -- confirm against base/macros.h.
279 DISALLOW_COPY_AND_ASSIGN(SafeBrowsingStoreFile);
282 #endif // CHROME_BROWSER_SAFE_BROWSING_SAFE_BROWSING_STORE_FILE_H_