[DominatorTree] Add support for mixed pre/post CFG views.
[llvm-project.git] / compiler-rt / lib / fuzzer / FuzzerCorpus.h
blobdaea4f5213b18272a6c7520393b073ffba31c015
1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // fuzzer::InputCorpus
9 //===----------------------------------------------------------------------===//
11 #ifndef LLVM_FUZZER_CORPUS
12 #define LLVM_FUZZER_CORPUS
14 #include "FuzzerDataFlowTrace.h"
15 #include "FuzzerDefs.h"
16 #include "FuzzerIO.h"
17 #include "FuzzerRandom.h"
18 #include "FuzzerSHA1.h"
19 #include "FuzzerTracePC.h"
20 #include <algorithm>
21 #include <chrono>
22 #include <numeric>
23 #include <random>
24 #include <unordered_set>
26 namespace fuzzer {
28 struct InputInfo {
29 Unit U; // The actual input data.
30 std::chrono::microseconds TimeOfUnit;
31 uint8_t Sha1[kSHA1NumBytes]; // Checksum.
32 // Number of features that this input has and no smaller input has.
33 size_t NumFeatures = 0;
34 size_t Tmp = 0; // Used by ValidateFeatureSet.
35 // Stats.
36 size_t NumExecutedMutations = 0;
37 size_t NumSuccessfullMutations = 0;
38 bool NeverReduce = false;
39 bool MayDeleteFile = false;
40 bool Reduced = false;
41 bool HasFocusFunction = false;
42 Vector<uint32_t> UniqFeatureSet;
43 Vector<uint8_t> DataFlowTraceForFocusFunction;
44 // Power schedule.
45 bool NeedsEnergyUpdate = false;
46 double Energy = 0.0;
47 size_t SumIncidence = 0;
48 Vector<std::pair<uint32_t, uint16_t>> FeatureFreqs;
50 // Delete feature Idx and its frequency from FeatureFreqs.
51 bool DeleteFeatureFreq(uint32_t Idx) {
52 if (FeatureFreqs.empty())
53 return false;
55 // Binary search over local feature frequencies sorted by index.
56 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
57 std::pair<uint32_t, uint16_t>(Idx, 0));
59 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
60 FeatureFreqs.erase(Lower);
61 return true;
63 return false;
66 // Assign more energy to a high-entropy seed, i.e., that reveals more
67 // information about the globally rare features in the neighborhood of the
68 // seed. Since we do not know the entropy of a seed that has never been
69 // executed we assign fresh seeds maximum entropy and let II->Energy approach
70 // the true entropy from above. If ScalePerExecTime is true, the computed
71 // entropy is scaled based on how fast this input executes compared to the
72 // average execution time of inputs. The faster an input executes, the more
73 // energy gets assigned to the input.
74 void UpdateEnergy(size_t GlobalNumberOfFeatures, bool ScalePerExecTime,
75 std::chrono::microseconds AverageUnitExecutionTime) {
76 Energy = 0.0;
77 SumIncidence = 0;
79 // Apply add-one smoothing to locally discovered features.
80 for (auto F : FeatureFreqs) {
81 size_t LocalIncidence = F.second + 1;
82 Energy -= LocalIncidence * logl(LocalIncidence);
83 SumIncidence += LocalIncidence;
86 // Apply add-one smoothing to locally undiscovered features.
87 // PreciseEnergy -= 0; // since logl(1.0) == 0)
88 SumIncidence += (GlobalNumberOfFeatures - FeatureFreqs.size());
90 // Add a single locally abundant feature apply add-one smoothing.
91 size_t AbdIncidence = NumExecutedMutations + 1;
92 Energy -= AbdIncidence * logl(AbdIncidence);
93 SumIncidence += AbdIncidence;
95 // Normalize.
96 if (SumIncidence != 0)
97 Energy = (Energy / SumIncidence) + logl(SumIncidence);
99 if (ScalePerExecTime) {
100 // Scaling to favor inputs with lower execution time.
101 uint32_t PerfScore = 100;
102 if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 10)
103 PerfScore = 10;
104 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 4)
105 PerfScore = 25;
106 else if (TimeOfUnit.count() > AverageUnitExecutionTime.count() * 2)
107 PerfScore = 50;
108 else if (TimeOfUnit.count() * 3 > AverageUnitExecutionTime.count() * 4)
109 PerfScore = 75;
110 else if (TimeOfUnit.count() * 4 < AverageUnitExecutionTime.count())
111 PerfScore = 300;
112 else if (TimeOfUnit.count() * 3 < AverageUnitExecutionTime.count())
113 PerfScore = 200;
114 else if (TimeOfUnit.count() * 2 < AverageUnitExecutionTime.count())
115 PerfScore = 150;
117 Energy *= PerfScore;
121 // Increment the frequency of the feature Idx.
122 void UpdateFeatureFrequency(uint32_t Idx) {
123 NeedsEnergyUpdate = true;
125 // The local feature frequencies is an ordered vector of pairs.
126 // If there are no local feature frequencies, push_back preserves order.
127 // Set the feature frequency for feature Idx32 to 1.
128 if (FeatureFreqs.empty()) {
129 FeatureFreqs.push_back(std::pair<uint32_t, uint16_t>(Idx, 1));
130 return;
133 // Binary search over local feature frequencies sorted by index.
134 auto Lower = std::lower_bound(FeatureFreqs.begin(), FeatureFreqs.end(),
135 std::pair<uint32_t, uint16_t>(Idx, 0));
137 // If feature Idx32 already exists, increment its frequency.
138 // Otherwise, insert a new pair right after the next lower index.
139 if (Lower != FeatureFreqs.end() && Lower->first == Idx) {
140 Lower->second++;
141 } else {
142 FeatureFreqs.insert(Lower, std::pair<uint32_t, uint16_t>(Idx, 1));
147 struct EntropicOptions {
148 bool Enabled;
149 size_t NumberOfRarestFeatures;
150 size_t FeatureFrequencyThreshold;
151 bool ScalePerExecTime;
154 class InputCorpus {
155 static const uint32_t kFeatureSetSize = 1 << 21;
156 static const uint8_t kMaxMutationFactor = 20;
157 static const size_t kSparseEnergyUpdates = 100;
159 size_t NumExecutedMutations = 0;
161 EntropicOptions Entropic;
163 public:
164 InputCorpus(const std::string &OutputCorpus, EntropicOptions Entropic)
165 : Entropic(Entropic), OutputCorpus(OutputCorpus) {
166 memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
167 memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
169 ~InputCorpus() {
170 for (auto II : Inputs)
171 delete II;
173 size_t size() const { return Inputs.size(); }
174 size_t SizeInBytes() const {
175 size_t Res = 0;
176 for (auto II : Inputs)
177 Res += II->U.size();
178 return Res;
180 size_t NumActiveUnits() const {
181 size_t Res = 0;
182 for (auto II : Inputs)
183 Res += !II->U.empty();
184 return Res;
186 size_t MaxInputSize() const {
187 size_t Res = 0;
188 for (auto II : Inputs)
189 Res = std::max(Res, II->U.size());
190 return Res;
192 void IncrementNumExecutedMutations() { NumExecutedMutations++; }
194 size_t NumInputsThatTouchFocusFunction() {
195 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
196 return II->HasFocusFunction;
200 size_t NumInputsWithDataFlowTrace() {
201 return std::count_if(Inputs.begin(), Inputs.end(), [](const InputInfo *II) {
202 return !II->DataFlowTraceForFocusFunction.empty();
206 bool empty() const { return Inputs.empty(); }
207 const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
208 InputInfo *AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile,
209 bool HasFocusFunction, bool NeverReduce,
210 std::chrono::microseconds TimeOfUnit,
211 const Vector<uint32_t> &FeatureSet,
212 const DataFlowTrace &DFT, const InputInfo *BaseII) {
213 assert(!U.empty());
214 if (FeatureDebug)
215 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs.size(), NumFeatures);
216 Inputs.push_back(new InputInfo());
217 InputInfo &II = *Inputs.back();
218 II.U = U;
219 II.NumFeatures = NumFeatures;
220 II.NeverReduce = NeverReduce;
221 II.TimeOfUnit = TimeOfUnit;
222 II.MayDeleteFile = MayDeleteFile;
223 II.UniqFeatureSet = FeatureSet;
224 II.HasFocusFunction = HasFocusFunction;
225 // Assign maximal energy to the new seed.
226 II.Energy = RareFeatures.empty() ? 1.0 : log(RareFeatures.size());
227 II.SumIncidence = RareFeatures.size();
228 II.NeedsEnergyUpdate = false;
229 std::sort(II.UniqFeatureSet.begin(), II.UniqFeatureSet.end());
230 ComputeSHA1(U.data(), U.size(), II.Sha1);
231 auto Sha1Str = Sha1ToString(II.Sha1);
232 Hashes.insert(Sha1Str);
233 if (HasFocusFunction)
234 if (auto V = DFT.Get(Sha1Str))
235 II.DataFlowTraceForFocusFunction = *V;
236 // This is a gross heuristic.
237 // Ideally, when we add an element to a corpus we need to know its DFT.
238 // But if we don't, we'll use the DFT of its base input.
239 if (II.DataFlowTraceForFocusFunction.empty() && BaseII)
240 II.DataFlowTraceForFocusFunction = BaseII->DataFlowTraceForFocusFunction;
241 DistributionNeedsUpdate = true;
242 PrintCorpus();
243 // ValidateFeatureSet();
244 return &II;
247 // Debug-only
248 void PrintUnit(const Unit &U) {
249 if (!FeatureDebug) return;
250 for (uint8_t C : U) {
251 if (C != 'F' && C != 'U' && C != 'Z')
252 C = '.';
253 Printf("%c", C);
257 // Debug-only
258 void PrintFeatureSet(const Vector<uint32_t> &FeatureSet) {
259 if (!FeatureDebug) return;
260 Printf("{");
261 for (uint32_t Feature: FeatureSet)
262 Printf("%u,", Feature);
263 Printf("}");
266 // Debug-only
267 void PrintCorpus() {
268 if (!FeatureDebug) return;
269 Printf("======= CORPUS:\n");
270 int i = 0;
271 for (auto II : Inputs) {
272 if (std::find(II->U.begin(), II->U.end(), 'F') != II->U.end()) {
273 Printf("[%2d] ", i);
274 Printf("%s sz=%zd ", Sha1ToString(II->Sha1).c_str(), II->U.size());
275 PrintUnit(II->U);
276 Printf(" ");
277 PrintFeatureSet(II->UniqFeatureSet);
278 Printf("\n");
280 i++;
284 void Replace(InputInfo *II, const Unit &U) {
285 assert(II->U.size() > U.size());
286 Hashes.erase(Sha1ToString(II->Sha1));
287 DeleteFile(*II);
288 ComputeSHA1(U.data(), U.size(), II->Sha1);
289 Hashes.insert(Sha1ToString(II->Sha1));
290 II->U = U;
291 II->Reduced = true;
292 DistributionNeedsUpdate = true;
295 bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
296 bool HasUnit(const std::string &H) { return Hashes.count(H); }
297 InputInfo &ChooseUnitToMutate(Random &Rand) {
298 InputInfo &II = *Inputs[ChooseUnitIdxToMutate(Rand)];
299 assert(!II.U.empty());
300 return II;
303 InputInfo &ChooseUnitToCrossOverWith(Random &Rand, bool UniformDist) {
304 if (!UniformDist) {
305 return ChooseUnitToMutate(Rand);
307 InputInfo &II = *Inputs[Rand(Inputs.size())];
308 assert(!II.U.empty());
309 return II;
312 // Returns an index of random unit from the corpus to mutate.
313 size_t ChooseUnitIdxToMutate(Random &Rand) {
314 UpdateCorpusDistribution(Rand);
315 size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
316 assert(Idx < Inputs.size());
317 return Idx;
320 void PrintStats() {
321 for (size_t i = 0; i < Inputs.size(); i++) {
322 const auto &II = *Inputs[i];
323 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i,
324 Sha1ToString(II.Sha1).c_str(), II.U.size(),
325 II.NumExecutedMutations, II.NumSuccessfullMutations, II.HasFocusFunction);
329 void PrintFeatureSet() {
330 for (size_t i = 0; i < kFeatureSetSize; i++) {
331 if(size_t Sz = GetFeature(i))
332 Printf("[%zd: id %zd sz%zd] ", i, SmallestElementPerFeature[i], Sz);
334 Printf("\n\t");
335 for (size_t i = 0; i < Inputs.size(); i++)
336 if (size_t N = Inputs[i]->NumFeatures)
337 Printf(" %zd=>%zd ", i, N);
338 Printf("\n");
341 void DeleteFile(const InputInfo &II) {
342 if (!OutputCorpus.empty() && II.MayDeleteFile)
343 RemoveFile(DirPlusFile(OutputCorpus, Sha1ToString(II.Sha1)));
346 void DeleteInput(size_t Idx) {
347 InputInfo &II = *Inputs[Idx];
348 DeleteFile(II);
349 Unit().swap(II.U);
350 II.Energy = 0.0;
351 II.NeedsEnergyUpdate = false;
352 DistributionNeedsUpdate = true;
353 if (FeatureDebug)
354 Printf("EVICTED %zd\n", Idx);
357 void AddRareFeature(uint32_t Idx) {
358 // Maintain *at least* TopXRarestFeatures many rare features
359 // and all features with a frequency below ConsideredRare.
360 // Remove all other features.
361 while (RareFeatures.size() > Entropic.NumberOfRarestFeatures &&
362 FreqOfMostAbundantRareFeature > Entropic.FeatureFrequencyThreshold) {
364 // Find most and second most abbundant feature.
365 uint32_t MostAbundantRareFeatureIndices[2] = {RareFeatures[0],
366 RareFeatures[0]};
367 size_t Delete = 0;
368 for (size_t i = 0; i < RareFeatures.size(); i++) {
369 uint32_t Idx2 = RareFeatures[i];
370 if (GlobalFeatureFreqs[Idx2] >=
371 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[0]]) {
372 MostAbundantRareFeatureIndices[1] = MostAbundantRareFeatureIndices[0];
373 MostAbundantRareFeatureIndices[0] = Idx2;
374 Delete = i;
378 // Remove most abundant rare feature.
379 RareFeatures[Delete] = RareFeatures.back();
380 RareFeatures.pop_back();
382 for (auto II : Inputs) {
383 if (II->DeleteFeatureFreq(MostAbundantRareFeatureIndices[0]))
384 II->NeedsEnergyUpdate = true;
387 // Set 2nd most abundant as the new most abundant feature count.
388 FreqOfMostAbundantRareFeature =
389 GlobalFeatureFreqs[MostAbundantRareFeatureIndices[1]];
392 // Add rare feature, handle collisions, and update energy.
393 RareFeatures.push_back(Idx);
394 GlobalFeatureFreqs[Idx] = 0;
395 for (auto II : Inputs) {
396 II->DeleteFeatureFreq(Idx);
398 // Apply add-one smoothing to this locally undiscovered feature.
399 // Zero energy seeds will never be fuzzed and remain zero energy.
400 if (II->Energy > 0.0) {
401 II->SumIncidence += 1;
402 II->Energy += logl(II->SumIncidence) / II->SumIncidence;
406 DistributionNeedsUpdate = true;
409 bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
410 assert(NewSize);
411 Idx = Idx % kFeatureSetSize;
412 uint32_t OldSize = GetFeature(Idx);
413 if (OldSize == 0 || (Shrink && OldSize > NewSize)) {
414 if (OldSize > 0) {
415 size_t OldIdx = SmallestElementPerFeature[Idx];
416 InputInfo &II = *Inputs[OldIdx];
417 assert(II.NumFeatures > 0);
418 II.NumFeatures--;
419 if (II.NumFeatures == 0)
420 DeleteInput(OldIdx);
421 } else {
422 NumAddedFeatures++;
423 if (Entropic.Enabled)
424 AddRareFeature((uint32_t)Idx);
426 NumUpdatedFeatures++;
427 if (FeatureDebug)
428 Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
429 SmallestElementPerFeature[Idx] = Inputs.size();
430 InputSizesPerFeature[Idx] = NewSize;
431 return true;
433 return false;
436 // Increment frequency of feature Idx globally and locally.
437 void UpdateFeatureFrequency(InputInfo *II, size_t Idx) {
438 uint32_t Idx32 = Idx % kFeatureSetSize;
440 // Saturated increment.
441 if (GlobalFeatureFreqs[Idx32] == 0xFFFF)
442 return;
443 uint16_t Freq = GlobalFeatureFreqs[Idx32]++;
445 // Skip if abundant.
446 if (Freq > FreqOfMostAbundantRareFeature ||
447 std::find(RareFeatures.begin(), RareFeatures.end(), Idx32) ==
448 RareFeatures.end())
449 return;
451 // Update global frequencies.
452 if (Freq == FreqOfMostAbundantRareFeature)
453 FreqOfMostAbundantRareFeature++;
455 // Update local frequencies.
456 if (II)
457 II->UpdateFeatureFrequency(Idx32);
460 size_t NumFeatures() const { return NumAddedFeatures; }
461 size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
463 private:
465 static const bool FeatureDebug = false;
467 size_t GetFeature(size_t Idx) const { return InputSizesPerFeature[Idx]; }
469 void ValidateFeatureSet() {
470 if (FeatureDebug)
471 PrintFeatureSet();
472 for (size_t Idx = 0; Idx < kFeatureSetSize; Idx++)
473 if (GetFeature(Idx))
474 Inputs[SmallestElementPerFeature[Idx]]->Tmp++;
475 for (auto II: Inputs) {
476 if (II->Tmp != II->NumFeatures)
477 Printf("ZZZ %zd %zd\n", II->Tmp, II->NumFeatures);
478 assert(II->Tmp == II->NumFeatures);
479 II->Tmp = 0;
483 // Updates the probability distribution for the units in the corpus.
484 // Must be called whenever the corpus or unit weights are changed.
486 // Hypothesis: inputs that maximize information about globally rare features
487 // are interesting.
488 void UpdateCorpusDistribution(Random &Rand) {
489 // Skip update if no seeds or rare features were added/deleted.
490 // Sparse updates for local change of feature frequencies,
491 // i.e., randomly do not skip.
492 if (!DistributionNeedsUpdate &&
493 (!Entropic.Enabled || Rand(kSparseEnergyUpdates)))
494 return;
496 DistributionNeedsUpdate = false;
498 size_t N = Inputs.size();
499 assert(N);
500 Intervals.resize(N + 1);
501 Weights.resize(N);
502 std::iota(Intervals.begin(), Intervals.end(), 0);
504 std::chrono::microseconds AverageUnitExecutionTime(0);
505 for (auto II : Inputs) {
506 AverageUnitExecutionTime += II->TimeOfUnit;
508 AverageUnitExecutionTime /= N;
510 bool VanillaSchedule = true;
511 if (Entropic.Enabled) {
512 for (auto II : Inputs) {
513 if (II->NeedsEnergyUpdate && II->Energy != 0.0) {
514 II->NeedsEnergyUpdate = false;
515 II->UpdateEnergy(RareFeatures.size(), Entropic.ScalePerExecTime,
516 AverageUnitExecutionTime);
520 for (size_t i = 0; i < N; i++) {
522 if (Inputs[i]->NumFeatures == 0) {
523 // If the seed doesn't represent any features, assign zero energy.
524 Weights[i] = 0.;
525 } else if (Inputs[i]->NumExecutedMutations / kMaxMutationFactor >
526 NumExecutedMutations / Inputs.size()) {
527 // If the seed was fuzzed a lot more than average, assign zero energy.
528 Weights[i] = 0.;
529 } else {
530 // Otherwise, simply assign the computed energy.
531 Weights[i] = Inputs[i]->Energy;
534 // If energy for all seeds is zero, fall back to vanilla schedule.
535 if (Weights[i] > 0.0)
536 VanillaSchedule = false;
540 if (VanillaSchedule) {
541 for (size_t i = 0; i < N; i++)
542 Weights[i] = Inputs[i]->NumFeatures
543 ? (i + 1) * (Inputs[i]->HasFocusFunction ? 1000 : 1)
544 : 0.;
547 if (FeatureDebug) {
548 for (size_t i = 0; i < N; i++)
549 Printf("%zd ", Inputs[i]->NumFeatures);
550 Printf("SCORE\n");
551 for (size_t i = 0; i < N; i++)
552 Printf("%f ", Weights[i]);
553 Printf("Weights\n");
555 CorpusDistribution = std::piecewise_constant_distribution<double>(
556 Intervals.begin(), Intervals.end(), Weights.begin());
558 std::piecewise_constant_distribution<double> CorpusDistribution;
560 Vector<double> Intervals;
561 Vector<double> Weights;
563 std::unordered_set<std::string> Hashes;
564 Vector<InputInfo*> Inputs;
566 size_t NumAddedFeatures = 0;
567 size_t NumUpdatedFeatures = 0;
568 uint32_t InputSizesPerFeature[kFeatureSetSize];
569 uint32_t SmallestElementPerFeature[kFeatureSetSize];
571 bool DistributionNeedsUpdate = true;
572 uint16_t FreqOfMostAbundantRareFeature = 0;
573 uint16_t GlobalFeatureFreqs[kFeatureSetSize] = {};
574 Vector<uint32_t> RareFeatures;
576 std::string OutputCorpus;
579 } // namespace fuzzer
581 #endif // LLVM_FUZZER_CORPUS