1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
11 #ifndef LLVM_FUZZER_CORPUS
12 #define LLVM_FUZZER_CORPUS
14 #include "FuzzerDataFlowTrace.h"
15 #include "FuzzerDefs.h"
17 #include "FuzzerRandom.h"
18 #include "FuzzerSHA1.h"
19 #include "FuzzerTracePC.h"
25 #include <unordered_set>
30 Unit U
; // The actual input data.
31 std::chrono::microseconds TimeOfUnit
;
32 uint8_t Sha1
[kSHA1NumBytes
]; // Checksum.
33 // Number of features that this input has and no smaller input has.
34 size_t NumFeatures
= 0;
35 size_t Tmp
= 0; // Used by ValidateFeatureSet.
37 size_t NumExecutedMutations
= 0;
38 size_t NumSuccessfullMutations
= 0;
39 bool NeverReduce
= false;
40 bool MayDeleteFile
= false;
42 bool HasFocusFunction
= false;
43 std::vector
<uint32_t> UniqFeatureSet
;
44 std::vector
<uint8_t> DataFlowTraceForFocusFunction
;
46 bool NeedsEnergyUpdate
= false;
48 double SumIncidence
= 0.0;
49 std::vector
<std::pair
<uint32_t, uint16_t>> FeatureFreqs
;
51 // Delete feature Idx and its frequency from FeatureFreqs.
52 bool DeleteFeatureFreq(uint32_t Idx
) {
53 if (FeatureFreqs
.empty())
56 // Binary search over local feature frequencies sorted by index.
57 auto Lower
= std::lower_bound(FeatureFreqs
.begin(), FeatureFreqs
.end(),
58 std::pair
<uint32_t, uint16_t>(Idx
, 0));
60 if (Lower
!= FeatureFreqs
.end() && Lower
->first
== Idx
) {
61 FeatureFreqs
.erase(Lower
);
67 // Assign more energy to a high-entropy seed, i.e., that reveals more
68 // information about the globally rare features in the neighborhood of the
69 // seed. Since we do not know the entropy of a seed that has never been
70 // executed we assign fresh seeds maximum entropy and let II->Energy approach
71 // the true entropy from above. If ScalePerExecTime is true, the computed
72 // entropy is scaled based on how fast this input executes compared to the
73 // average execution time of inputs. The faster an input executes, the more
74 // energy gets assigned to the input.
75 void UpdateEnergy(size_t GlobalNumberOfFeatures
, bool ScalePerExecTime
,
76 std::chrono::microseconds AverageUnitExecutionTime
) {
80 // Apply add-one smoothing to locally discovered features.
81 for (const auto &F
: FeatureFreqs
) {
82 double LocalIncidence
= F
.second
+ 1;
83 Energy
-= LocalIncidence
* log(LocalIncidence
);
84 SumIncidence
+= LocalIncidence
;
87 // Apply add-one smoothing to locally undiscovered features.
88 // PreciseEnergy -= 0; // since log(1.0) == 0)
90 static_cast<double>(GlobalNumberOfFeatures
- FeatureFreqs
.size());
92 // Add a single locally abundant feature apply add-one smoothing.
93 double AbdIncidence
= static_cast<double>(NumExecutedMutations
+ 1);
94 Energy
-= AbdIncidence
* log(AbdIncidence
);
95 SumIncidence
+= AbdIncidence
;
98 if (SumIncidence
!= 0)
99 Energy
= Energy
/ SumIncidence
+ log(SumIncidence
);
101 if (ScalePerExecTime
) {
102 // Scaling to favor inputs with lower execution time.
103 uint32_t PerfScore
= 100;
104 if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 10)
106 else if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 4)
108 else if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 2)
110 else if (TimeOfUnit
.count() * 3 > AverageUnitExecutionTime
.count() * 4)
112 else if (TimeOfUnit
.count() * 4 < AverageUnitExecutionTime
.count())
114 else if (TimeOfUnit
.count() * 3 < AverageUnitExecutionTime
.count())
116 else if (TimeOfUnit
.count() * 2 < AverageUnitExecutionTime
.count())
123 // Increment the frequency of the feature Idx.
124 void UpdateFeatureFrequency(uint32_t Idx
) {
125 NeedsEnergyUpdate
= true;
127 // The local feature frequencies is an ordered vector of pairs.
128 // If there are no local feature frequencies, push_back preserves order.
129 // Set the feature frequency for feature Idx32 to 1.
130 if (FeatureFreqs
.empty()) {
131 FeatureFreqs
.push_back(std::pair
<uint32_t, uint16_t>(Idx
, 1));
135 // Binary search over local feature frequencies sorted by index.
136 auto Lower
= std::lower_bound(FeatureFreqs
.begin(), FeatureFreqs
.end(),
137 std::pair
<uint32_t, uint16_t>(Idx
, 0));
139 // If feature Idx32 already exists, increment its frequency.
140 // Otherwise, insert a new pair right after the next lower index.
141 if (Lower
!= FeatureFreqs
.end() && Lower
->first
== Idx
) {
144 FeatureFreqs
.insert(Lower
, std::pair
<uint32_t, uint16_t>(Idx
, 1));
149 struct EntropicOptions
{
151 size_t NumberOfRarestFeatures
;
152 size_t FeatureFrequencyThreshold
;
153 bool ScalePerExecTime
;
157 static const uint32_t kFeatureSetSize
= 1 << 21;
158 static const uint8_t kMaxMutationFactor
= 20;
159 static const size_t kSparseEnergyUpdates
= 100;
161 size_t NumExecutedMutations
= 0;
163 EntropicOptions Entropic
;
166 InputCorpus(const std::string
&OutputCorpus
, EntropicOptions Entropic
)
167 : Entropic(Entropic
), OutputCorpus(OutputCorpus
) {
168 memset(InputSizesPerFeature
, 0, sizeof(InputSizesPerFeature
));
169 memset(SmallestElementPerFeature
, 0, sizeof(SmallestElementPerFeature
));
172 for (auto II
: Inputs
)
175 size_t size() const { return Inputs
.size(); }
176 size_t SizeInBytes() const {
178 for (auto II
: Inputs
)
182 size_t NumActiveUnits() const {
184 for (auto II
: Inputs
)
185 Res
+= !II
->U
.empty();
188 size_t MaxInputSize() const {
190 for (auto II
: Inputs
)
191 Res
= std::max(Res
, II
->U
.size());
194 void IncrementNumExecutedMutations() { NumExecutedMutations
++; }
196 size_t NumInputsThatTouchFocusFunction() {
197 return std::count_if(Inputs
.begin(), Inputs
.end(), [](const InputInfo
*II
) {
198 return II
->HasFocusFunction
;
202 size_t NumInputsWithDataFlowTrace() {
203 return std::count_if(Inputs
.begin(), Inputs
.end(), [](const InputInfo
*II
) {
204 return !II
->DataFlowTraceForFocusFunction
.empty();
208 bool empty() const { return Inputs
.empty(); }
209 const Unit
&operator[] (size_t Idx
) const { return Inputs
[Idx
]->U
; }
210 InputInfo
*AddToCorpus(const Unit
&U
, size_t NumFeatures
, bool MayDeleteFile
,
211 bool HasFocusFunction
, bool NeverReduce
,
212 std::chrono::microseconds TimeOfUnit
,
213 const std::vector
<uint32_t> &FeatureSet
,
214 const DataFlowTrace
&DFT
, const InputInfo
*BaseII
) {
217 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs
.size(), NumFeatures
);
218 // Inputs.size() is cast to uint32_t below.
219 assert(Inputs
.size() < std::numeric_limits
<uint32_t>::max());
220 Inputs
.push_back(new InputInfo());
221 InputInfo
&II
= *Inputs
.back();
223 II
.NumFeatures
= NumFeatures
;
224 II
.NeverReduce
= NeverReduce
;
225 II
.TimeOfUnit
= TimeOfUnit
;
226 II
.MayDeleteFile
= MayDeleteFile
;
227 II
.UniqFeatureSet
= FeatureSet
;
228 II
.HasFocusFunction
= HasFocusFunction
;
229 // Assign maximal energy to the new seed.
230 II
.Energy
= RareFeatures
.empty() ? 1.0 : log(RareFeatures
.size());
231 II
.SumIncidence
= static_cast<double>(RareFeatures
.size());
232 II
.NeedsEnergyUpdate
= false;
233 std::sort(II
.UniqFeatureSet
.begin(), II
.UniqFeatureSet
.end());
234 ComputeSHA1(U
.data(), U
.size(), II
.Sha1
);
235 auto Sha1Str
= Sha1ToString(II
.Sha1
);
236 Hashes
.insert(Sha1Str
);
237 if (HasFocusFunction
)
238 if (auto V
= DFT
.Get(Sha1Str
))
239 II
.DataFlowTraceForFocusFunction
= *V
;
240 // This is a gross heuristic.
241 // Ideally, when we add an element to a corpus we need to know its DFT.
242 // But if we don't, we'll use the DFT of its base input.
243 if (II
.DataFlowTraceForFocusFunction
.empty() && BaseII
)
244 II
.DataFlowTraceForFocusFunction
= BaseII
->DataFlowTraceForFocusFunction
;
245 DistributionNeedsUpdate
= true;
247 // ValidateFeatureSet();
252 void PrintUnit(const Unit
&U
) {
253 if (!FeatureDebug
) return;
254 for (uint8_t C
: U
) {
255 if (C
!= 'F' && C
!= 'U' && C
!= 'Z')
262 void PrintFeatureSet(const std::vector
<uint32_t> &FeatureSet
) {
263 if (!FeatureDebug
) return;
265 for (uint32_t Feature
: FeatureSet
)
266 Printf("%u,", Feature
);
272 if (!FeatureDebug
) return;
273 Printf("======= CORPUS:\n");
275 for (auto II
: Inputs
) {
276 if (std::find(II
->U
.begin(), II
->U
.end(), 'F') != II
->U
.end()) {
278 Printf("%s sz=%zd ", Sha1ToString(II
->Sha1
).c_str(), II
->U
.size());
281 PrintFeatureSet(II
->UniqFeatureSet
);
288 void Replace(InputInfo
*II
, const Unit
&U
,
289 std::chrono::microseconds TimeOfUnit
) {
290 assert(II
->U
.size() > U
.size());
291 Hashes
.erase(Sha1ToString(II
->Sha1
));
293 ComputeSHA1(U
.data(), U
.size(), II
->Sha1
);
294 Hashes
.insert(Sha1ToString(II
->Sha1
));
297 II
->TimeOfUnit
= TimeOfUnit
;
298 DistributionNeedsUpdate
= true;
301 bool HasUnit(const Unit
&U
) { return Hashes
.count(Hash(U
)); }
302 bool HasUnit(const std::string
&H
) { return Hashes
.count(H
); }
303 InputInfo
&ChooseUnitToMutate(Random
&Rand
) {
304 InputInfo
&II
= *Inputs
[ChooseUnitIdxToMutate(Rand
)];
305 assert(!II
.U
.empty());
309 InputInfo
&ChooseUnitToCrossOverWith(Random
&Rand
, bool UniformDist
) {
311 return ChooseUnitToMutate(Rand
);
313 InputInfo
&II
= *Inputs
[Rand(Inputs
.size())];
314 assert(!II
.U
.empty());
318 // Returns an index of random unit from the corpus to mutate.
319 size_t ChooseUnitIdxToMutate(Random
&Rand
) {
320 UpdateCorpusDistribution(Rand
);
321 size_t Idx
= static_cast<size_t>(CorpusDistribution(Rand
));
322 assert(Idx
< Inputs
.size());
327 for (size_t i
= 0; i
< Inputs
.size(); i
++) {
328 const auto &II
= *Inputs
[i
];
329 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i
,
330 Sha1ToString(II
.Sha1
).c_str(), II
.U
.size(),
331 II
.NumExecutedMutations
, II
.NumSuccessfullMutations
,
332 II
.HasFocusFunction
);
336 void PrintFeatureSet() {
337 for (size_t i
= 0; i
< kFeatureSetSize
; i
++) {
338 if(size_t Sz
= GetFeature(i
))
339 Printf("[%zd: id %zd sz%zd] ", i
, SmallestElementPerFeature
[i
], Sz
);
342 for (size_t i
= 0; i
< Inputs
.size(); i
++)
343 if (size_t N
= Inputs
[i
]->NumFeatures
)
344 Printf(" %zd=>%zd ", i
, N
);
348 void DeleteFile(const InputInfo
&II
) {
349 if (!OutputCorpus
.empty() && II
.MayDeleteFile
)
350 RemoveFile(DirPlusFile(OutputCorpus
, Sha1ToString(II
.Sha1
)));
353 void DeleteInput(size_t Idx
) {
354 InputInfo
&II
= *Inputs
[Idx
];
358 II
.NeedsEnergyUpdate
= false;
359 DistributionNeedsUpdate
= true;
361 Printf("EVICTED %zd\n", Idx
);
364 void AddRareFeature(uint32_t Idx
) {
365 // Maintain *at least* TopXRarestFeatures many rare features
366 // and all features with a frequency below ConsideredRare.
367 // Remove all other features.
368 while (RareFeatures
.size() > Entropic
.NumberOfRarestFeatures
&&
369 FreqOfMostAbundantRareFeature
> Entropic
.FeatureFrequencyThreshold
) {
371 // Find most and second most abbundant feature.
372 uint32_t MostAbundantRareFeatureIndices
[2] = {RareFeatures
[0],
375 for (size_t i
= 0; i
< RareFeatures
.size(); i
++) {
376 uint32_t Idx2
= RareFeatures
[i
];
377 if (GlobalFeatureFreqs
[Idx2
] >=
378 GlobalFeatureFreqs
[MostAbundantRareFeatureIndices
[0]]) {
379 MostAbundantRareFeatureIndices
[1] = MostAbundantRareFeatureIndices
[0];
380 MostAbundantRareFeatureIndices
[0] = Idx2
;
385 // Remove most abundant rare feature.
386 IsRareFeature
[Delete
] = false;
387 RareFeatures
[Delete
] = RareFeatures
.back();
388 RareFeatures
.pop_back();
390 for (auto II
: Inputs
) {
391 if (II
->DeleteFeatureFreq(MostAbundantRareFeatureIndices
[0]))
392 II
->NeedsEnergyUpdate
= true;
395 // Set 2nd most abundant as the new most abundant feature count.
396 FreqOfMostAbundantRareFeature
=
397 GlobalFeatureFreqs
[MostAbundantRareFeatureIndices
[1]];
400 // Add rare feature, handle collisions, and update energy.
401 RareFeatures
.push_back(Idx
);
402 IsRareFeature
[Idx
] = true;
403 GlobalFeatureFreqs
[Idx
] = 0;
404 for (auto II
: Inputs
) {
405 II
->DeleteFeatureFreq(Idx
);
407 // Apply add-one smoothing to this locally undiscovered feature.
408 // Zero energy seeds will never be fuzzed and remain zero energy.
409 if (II
->Energy
> 0.0) {
410 II
->SumIncidence
+= 1;
411 II
->Energy
+= log(II
->SumIncidence
) / II
->SumIncidence
;
415 DistributionNeedsUpdate
= true;
418 bool AddFeature(size_t Idx
, uint32_t NewSize
, bool Shrink
) {
420 Idx
= Idx
% kFeatureSetSize
;
421 uint32_t OldSize
= GetFeature(Idx
);
422 if (OldSize
== 0 || (Shrink
&& OldSize
> NewSize
)) {
424 size_t OldIdx
= SmallestElementPerFeature
[Idx
];
425 InputInfo
&II
= *Inputs
[OldIdx
];
426 assert(II
.NumFeatures
> 0);
428 if (II
.NumFeatures
== 0)
432 if (Entropic
.Enabled
)
433 AddRareFeature((uint32_t)Idx
);
435 NumUpdatedFeatures
++;
437 Printf("ADD FEATURE %zd sz %d\n", Idx
, NewSize
);
438 // Inputs.size() is guaranteed to be less than UINT32_MAX by AddToCorpus.
439 SmallestElementPerFeature
[Idx
] = static_cast<uint32_t>(Inputs
.size());
440 InputSizesPerFeature
[Idx
] = NewSize
;
446 // Increment frequency of feature Idx globally and locally.
447 void UpdateFeatureFrequency(InputInfo
*II
, size_t Idx
) {
448 uint32_t Idx32
= Idx
% kFeatureSetSize
;
450 // Saturated increment.
451 if (GlobalFeatureFreqs
[Idx32
] == 0xFFFF)
453 uint16_t Freq
= GlobalFeatureFreqs
[Idx32
]++;
456 if (Freq
> FreqOfMostAbundantRareFeature
|| !IsRareFeature
[Idx32
])
459 // Update global frequencies.
460 if (Freq
== FreqOfMostAbundantRareFeature
)
461 FreqOfMostAbundantRareFeature
++;
463 // Update local frequencies.
465 II
->UpdateFeatureFrequency(Idx32
);
468 size_t NumFeatures() const { return NumAddedFeatures
; }
469 size_t NumFeatureUpdates() const { return NumUpdatedFeatures
; }
473 static const bool FeatureDebug
= false;
475 uint32_t GetFeature(size_t Idx
) const { return InputSizesPerFeature
[Idx
]; }
477 void ValidateFeatureSet() {
480 for (size_t Idx
= 0; Idx
< kFeatureSetSize
; Idx
++)
482 Inputs
[SmallestElementPerFeature
[Idx
]]->Tmp
++;
483 for (auto II
: Inputs
) {
484 if (II
->Tmp
!= II
->NumFeatures
)
485 Printf("ZZZ %zd %zd\n", II
->Tmp
, II
->NumFeatures
);
486 assert(II
->Tmp
== II
->NumFeatures
);
491 // Updates the probability distribution for the units in the corpus.
492 // Must be called whenever the corpus or unit weights are changed.
494 // Hypothesis: inputs that maximize information about globally rare features
496 void UpdateCorpusDistribution(Random
&Rand
) {
497 // Skip update if no seeds or rare features were added/deleted.
498 // Sparse updates for local change of feature frequencies,
499 // i.e., randomly do not skip.
500 if (!DistributionNeedsUpdate
&&
501 (!Entropic
.Enabled
|| Rand(kSparseEnergyUpdates
)))
504 DistributionNeedsUpdate
= false;
506 size_t N
= Inputs
.size();
508 Intervals
.resize(N
+ 1);
510 std::iota(Intervals
.begin(), Intervals
.end(), 0);
512 std::chrono::microseconds
AverageUnitExecutionTime(0);
513 for (auto II
: Inputs
) {
514 AverageUnitExecutionTime
+= II
->TimeOfUnit
;
516 AverageUnitExecutionTime
/= N
;
518 bool VanillaSchedule
= true;
519 if (Entropic
.Enabled
) {
520 for (auto II
: Inputs
) {
521 if (II
->NeedsEnergyUpdate
&& II
->Energy
!= 0.0) {
522 II
->NeedsEnergyUpdate
= false;
523 II
->UpdateEnergy(RareFeatures
.size(), Entropic
.ScalePerExecTime
,
524 AverageUnitExecutionTime
);
528 for (size_t i
= 0; i
< N
; i
++) {
530 if (Inputs
[i
]->NumFeatures
== 0) {
531 // If the seed doesn't represent any features, assign zero energy.
533 } else if (Inputs
[i
]->NumExecutedMutations
/ kMaxMutationFactor
>
534 NumExecutedMutations
/ Inputs
.size()) {
535 // If the seed was fuzzed a lot more than average, assign zero energy.
538 // Otherwise, simply assign the computed energy.
539 Weights
[i
] = Inputs
[i
]->Energy
;
542 // If energy for all seeds is zero, fall back to vanilla schedule.
543 if (Weights
[i
] > 0.0)
544 VanillaSchedule
= false;
548 if (VanillaSchedule
) {
549 for (size_t i
= 0; i
< N
; i
++)
551 Inputs
[i
]->NumFeatures
552 ? static_cast<double>((i
+ 1) *
553 (Inputs
[i
]->HasFocusFunction
? 1000 : 1))
558 for (size_t i
= 0; i
< N
; i
++)
559 Printf("%zd ", Inputs
[i
]->NumFeatures
);
561 for (size_t i
= 0; i
< N
; i
++)
562 Printf("%f ", Weights
[i
]);
565 CorpusDistribution
= std::piecewise_constant_distribution
<double>(
566 Intervals
.begin(), Intervals
.end(), Weights
.begin());
568 std::piecewise_constant_distribution
<double> CorpusDistribution
;
570 std::vector
<double> Intervals
;
571 std::vector
<double> Weights
;
573 std::unordered_set
<std::string
> Hashes
;
574 std::vector
<InputInfo
*> Inputs
;
576 size_t NumAddedFeatures
= 0;
577 size_t NumUpdatedFeatures
= 0;
578 uint32_t InputSizesPerFeature
[kFeatureSetSize
];
579 uint32_t SmallestElementPerFeature
[kFeatureSetSize
];
581 bool DistributionNeedsUpdate
= true;
582 uint16_t FreqOfMostAbundantRareFeature
= 0;
583 uint16_t GlobalFeatureFreqs
[kFeatureSetSize
] = {};
584 std::vector
<uint32_t> RareFeatures
;
585 std::bitset
<kFeatureSetSize
> IsRareFeature
;
587 std::string OutputCorpus
;
590 } // namespace fuzzer
592 #endif // LLVM_FUZZER_CORPUS