1 //===- FuzzerCorpus.h - Internal header for the Fuzzer ----------*- C++ -* ===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 //===----------------------------------------------------------------------===//
11 #ifndef LLVM_FUZZER_CORPUS
12 #define LLVM_FUZZER_CORPUS
14 #include "FuzzerDataFlowTrace.h"
15 #include "FuzzerDefs.h"
17 #include "FuzzerRandom.h"
18 #include "FuzzerSHA1.h"
19 #include "FuzzerTracePC.h"
24 #include <unordered_set>
29 Unit U
; // The actual input data.
30 std::chrono::microseconds TimeOfUnit
;
31 uint8_t Sha1
[kSHA1NumBytes
]; // Checksum.
32 // Number of features that this input has and no smaller input has.
33 size_t NumFeatures
= 0;
34 size_t Tmp
= 0; // Used by ValidateFeatureSet.
36 size_t NumExecutedMutations
= 0;
37 size_t NumSuccessfullMutations
= 0;
38 bool NeverReduce
= false;
39 bool MayDeleteFile
= false;
41 bool HasFocusFunction
= false;
42 Vector
<uint32_t> UniqFeatureSet
;
43 Vector
<uint8_t> DataFlowTraceForFocusFunction
;
45 bool NeedsEnergyUpdate
= false;
47 size_t SumIncidence
= 0;
48 Vector
<std::pair
<uint32_t, uint16_t>> FeatureFreqs
;
50 // Delete feature Idx and its frequency from FeatureFreqs.
51 bool DeleteFeatureFreq(uint32_t Idx
) {
52 if (FeatureFreqs
.empty())
55 // Binary search over local feature frequencies sorted by index.
56 auto Lower
= std::lower_bound(FeatureFreqs
.begin(), FeatureFreqs
.end(),
57 std::pair
<uint32_t, uint16_t>(Idx
, 0));
59 if (Lower
!= FeatureFreqs
.end() && Lower
->first
== Idx
) {
60 FeatureFreqs
.erase(Lower
);
66 // Assign more energy to a high-entropy seed, i.e., that reveals more
67 // information about the globally rare features in the neighborhood of the
68 // seed. Since we do not know the entropy of a seed that has never been
69 // executed we assign fresh seeds maximum entropy and let II->Energy approach
70 // the true entropy from above. If ScalePerExecTime is true, the computed
71 // entropy is scaled based on how fast this input executes compared to the
72 // average execution time of inputs. The faster an input executes, the more
73 // energy gets assigned to the input.
74 void UpdateEnergy(size_t GlobalNumberOfFeatures
, bool ScalePerExecTime
,
75 std::chrono::microseconds AverageUnitExecutionTime
) {
79 // Apply add-one smoothing to locally discovered features.
80 for (auto F
: FeatureFreqs
) {
81 size_t LocalIncidence
= F
.second
+ 1;
82 Energy
-= LocalIncidence
* logl(LocalIncidence
);
83 SumIncidence
+= LocalIncidence
;
86 // Apply add-one smoothing to locally undiscovered features.
87 // PreciseEnergy -= 0; // since logl(1.0) == 0)
88 SumIncidence
+= (GlobalNumberOfFeatures
- FeatureFreqs
.size());
90 // Add a single locally abundant feature apply add-one smoothing.
91 size_t AbdIncidence
= NumExecutedMutations
+ 1;
92 Energy
-= AbdIncidence
* logl(AbdIncidence
);
93 SumIncidence
+= AbdIncidence
;
96 if (SumIncidence
!= 0)
97 Energy
= (Energy
/ SumIncidence
) + logl(SumIncidence
);
99 if (ScalePerExecTime
) {
100 // Scaling to favor inputs with lower execution time.
101 uint32_t PerfScore
= 100;
102 if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 10)
104 else if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 4)
106 else if (TimeOfUnit
.count() > AverageUnitExecutionTime
.count() * 2)
108 else if (TimeOfUnit
.count() * 3 > AverageUnitExecutionTime
.count() * 4)
110 else if (TimeOfUnit
.count() * 4 < AverageUnitExecutionTime
.count())
112 else if (TimeOfUnit
.count() * 3 < AverageUnitExecutionTime
.count())
114 else if (TimeOfUnit
.count() * 2 < AverageUnitExecutionTime
.count())
121 // Increment the frequency of the feature Idx.
122 void UpdateFeatureFrequency(uint32_t Idx
) {
123 NeedsEnergyUpdate
= true;
125 // The local feature frequencies is an ordered vector of pairs.
126 // If there are no local feature frequencies, push_back preserves order.
127 // Set the feature frequency for feature Idx32 to 1.
128 if (FeatureFreqs
.empty()) {
129 FeatureFreqs
.push_back(std::pair
<uint32_t, uint16_t>(Idx
, 1));
133 // Binary search over local feature frequencies sorted by index.
134 auto Lower
= std::lower_bound(FeatureFreqs
.begin(), FeatureFreqs
.end(),
135 std::pair
<uint32_t, uint16_t>(Idx
, 0));
137 // If feature Idx32 already exists, increment its frequency.
138 // Otherwise, insert a new pair right after the next lower index.
139 if (Lower
!= FeatureFreqs
.end() && Lower
->first
== Idx
) {
142 FeatureFreqs
.insert(Lower
, std::pair
<uint32_t, uint16_t>(Idx
, 1));
147 struct EntropicOptions
{
149 size_t NumberOfRarestFeatures
;
150 size_t FeatureFrequencyThreshold
;
151 bool ScalePerExecTime
;
155 static const uint32_t kFeatureSetSize
= 1 << 21;
156 static const uint8_t kMaxMutationFactor
= 20;
157 static const size_t kSparseEnergyUpdates
= 100;
159 size_t NumExecutedMutations
= 0;
161 EntropicOptions Entropic
;
164 InputCorpus(const std::string
&OutputCorpus
, EntropicOptions Entropic
)
165 : Entropic(Entropic
), OutputCorpus(OutputCorpus
) {
166 memset(InputSizesPerFeature
, 0, sizeof(InputSizesPerFeature
));
167 memset(SmallestElementPerFeature
, 0, sizeof(SmallestElementPerFeature
));
170 for (auto II
: Inputs
)
173 size_t size() const { return Inputs
.size(); }
174 size_t SizeInBytes() const {
176 for (auto II
: Inputs
)
180 size_t NumActiveUnits() const {
182 for (auto II
: Inputs
)
183 Res
+= !II
->U
.empty();
186 size_t MaxInputSize() const {
188 for (auto II
: Inputs
)
189 Res
= std::max(Res
, II
->U
.size());
192 void IncrementNumExecutedMutations() { NumExecutedMutations
++; }
194 size_t NumInputsThatTouchFocusFunction() {
195 return std::count_if(Inputs
.begin(), Inputs
.end(), [](const InputInfo
*II
) {
196 return II
->HasFocusFunction
;
200 size_t NumInputsWithDataFlowTrace() {
201 return std::count_if(Inputs
.begin(), Inputs
.end(), [](const InputInfo
*II
) {
202 return !II
->DataFlowTraceForFocusFunction
.empty();
206 bool empty() const { return Inputs
.empty(); }
207 const Unit
&operator[] (size_t Idx
) const { return Inputs
[Idx
]->U
; }
208 InputInfo
*AddToCorpus(const Unit
&U
, size_t NumFeatures
, bool MayDeleteFile
,
209 bool HasFocusFunction
, bool NeverReduce
,
210 std::chrono::microseconds TimeOfUnit
,
211 const Vector
<uint32_t> &FeatureSet
,
212 const DataFlowTrace
&DFT
, const InputInfo
*BaseII
) {
215 Printf("ADD_TO_CORPUS %zd NF %zd\n", Inputs
.size(), NumFeatures
);
216 Inputs
.push_back(new InputInfo());
217 InputInfo
&II
= *Inputs
.back();
219 II
.NumFeatures
= NumFeatures
;
220 II
.NeverReduce
= NeverReduce
;
221 II
.TimeOfUnit
= TimeOfUnit
;
222 II
.MayDeleteFile
= MayDeleteFile
;
223 II
.UniqFeatureSet
= FeatureSet
;
224 II
.HasFocusFunction
= HasFocusFunction
;
225 // Assign maximal energy to the new seed.
226 II
.Energy
= RareFeatures
.empty() ? 1.0 : log(RareFeatures
.size());
227 II
.SumIncidence
= RareFeatures
.size();
228 II
.NeedsEnergyUpdate
= false;
229 std::sort(II
.UniqFeatureSet
.begin(), II
.UniqFeatureSet
.end());
230 ComputeSHA1(U
.data(), U
.size(), II
.Sha1
);
231 auto Sha1Str
= Sha1ToString(II
.Sha1
);
232 Hashes
.insert(Sha1Str
);
233 if (HasFocusFunction
)
234 if (auto V
= DFT
.Get(Sha1Str
))
235 II
.DataFlowTraceForFocusFunction
= *V
;
236 // This is a gross heuristic.
237 // Ideally, when we add an element to a corpus we need to know its DFT.
238 // But if we don't, we'll use the DFT of its base input.
239 if (II
.DataFlowTraceForFocusFunction
.empty() && BaseII
)
240 II
.DataFlowTraceForFocusFunction
= BaseII
->DataFlowTraceForFocusFunction
;
241 DistributionNeedsUpdate
= true;
243 // ValidateFeatureSet();
248 void PrintUnit(const Unit
&U
) {
249 if (!FeatureDebug
) return;
250 for (uint8_t C
: U
) {
251 if (C
!= 'F' && C
!= 'U' && C
!= 'Z')
258 void PrintFeatureSet(const Vector
<uint32_t> &FeatureSet
) {
259 if (!FeatureDebug
) return;
261 for (uint32_t Feature
: FeatureSet
)
262 Printf("%u,", Feature
);
268 if (!FeatureDebug
) return;
269 Printf("======= CORPUS:\n");
271 for (auto II
: Inputs
) {
272 if (std::find(II
->U
.begin(), II
->U
.end(), 'F') != II
->U
.end()) {
274 Printf("%s sz=%zd ", Sha1ToString(II
->Sha1
).c_str(), II
->U
.size());
277 PrintFeatureSet(II
->UniqFeatureSet
);
284 void Replace(InputInfo
*II
, const Unit
&U
) {
285 assert(II
->U
.size() > U
.size());
286 Hashes
.erase(Sha1ToString(II
->Sha1
));
288 ComputeSHA1(U
.data(), U
.size(), II
->Sha1
);
289 Hashes
.insert(Sha1ToString(II
->Sha1
));
292 DistributionNeedsUpdate
= true;
295 bool HasUnit(const Unit
&U
) { return Hashes
.count(Hash(U
)); }
296 bool HasUnit(const std::string
&H
) { return Hashes
.count(H
); }
297 InputInfo
&ChooseUnitToMutate(Random
&Rand
) {
298 InputInfo
&II
= *Inputs
[ChooseUnitIdxToMutate(Rand
)];
299 assert(!II
.U
.empty());
303 InputInfo
&ChooseUnitToCrossOverWith(Random
&Rand
, bool UniformDist
) {
305 return ChooseUnitToMutate(Rand
);
307 InputInfo
&II
= *Inputs
[Rand(Inputs
.size())];
308 assert(!II
.U
.empty());
312 // Returns an index of random unit from the corpus to mutate.
313 size_t ChooseUnitIdxToMutate(Random
&Rand
) {
314 UpdateCorpusDistribution(Rand
);
315 size_t Idx
= static_cast<size_t>(CorpusDistribution(Rand
));
316 assert(Idx
< Inputs
.size());
321 for (size_t i
= 0; i
< Inputs
.size(); i
++) {
322 const auto &II
= *Inputs
[i
];
323 Printf(" [% 3zd %s] sz: % 5zd runs: % 5zd succ: % 5zd focus: %d\n", i
,
324 Sha1ToString(II
.Sha1
).c_str(), II
.U
.size(),
325 II
.NumExecutedMutations
, II
.NumSuccessfullMutations
, II
.HasFocusFunction
);
329 void PrintFeatureSet() {
330 for (size_t i
= 0; i
< kFeatureSetSize
; i
++) {
331 if(size_t Sz
= GetFeature(i
))
332 Printf("[%zd: id %zd sz%zd] ", i
, SmallestElementPerFeature
[i
], Sz
);
335 for (size_t i
= 0; i
< Inputs
.size(); i
++)
336 if (size_t N
= Inputs
[i
]->NumFeatures
)
337 Printf(" %zd=>%zd ", i
, N
);
341 void DeleteFile(const InputInfo
&II
) {
342 if (!OutputCorpus
.empty() && II
.MayDeleteFile
)
343 RemoveFile(DirPlusFile(OutputCorpus
, Sha1ToString(II
.Sha1
)));
346 void DeleteInput(size_t Idx
) {
347 InputInfo
&II
= *Inputs
[Idx
];
351 II
.NeedsEnergyUpdate
= false;
352 DistributionNeedsUpdate
= true;
354 Printf("EVICTED %zd\n", Idx
);
357 void AddRareFeature(uint32_t Idx
) {
358 // Maintain *at least* TopXRarestFeatures many rare features
359 // and all features with a frequency below ConsideredRare.
360 // Remove all other features.
361 while (RareFeatures
.size() > Entropic
.NumberOfRarestFeatures
&&
362 FreqOfMostAbundantRareFeature
> Entropic
.FeatureFrequencyThreshold
) {
364 // Find most and second most abbundant feature.
365 uint32_t MostAbundantRareFeatureIndices
[2] = {RareFeatures
[0],
368 for (size_t i
= 0; i
< RareFeatures
.size(); i
++) {
369 uint32_t Idx2
= RareFeatures
[i
];
370 if (GlobalFeatureFreqs
[Idx2
] >=
371 GlobalFeatureFreqs
[MostAbundantRareFeatureIndices
[0]]) {
372 MostAbundantRareFeatureIndices
[1] = MostAbundantRareFeatureIndices
[0];
373 MostAbundantRareFeatureIndices
[0] = Idx2
;
378 // Remove most abundant rare feature.
379 RareFeatures
[Delete
] = RareFeatures
.back();
380 RareFeatures
.pop_back();
382 for (auto II
: Inputs
) {
383 if (II
->DeleteFeatureFreq(MostAbundantRareFeatureIndices
[0]))
384 II
->NeedsEnergyUpdate
= true;
387 // Set 2nd most abundant as the new most abundant feature count.
388 FreqOfMostAbundantRareFeature
=
389 GlobalFeatureFreqs
[MostAbundantRareFeatureIndices
[1]];
392 // Add rare feature, handle collisions, and update energy.
393 RareFeatures
.push_back(Idx
);
394 GlobalFeatureFreqs
[Idx
] = 0;
395 for (auto II
: Inputs
) {
396 II
->DeleteFeatureFreq(Idx
);
398 // Apply add-one smoothing to this locally undiscovered feature.
399 // Zero energy seeds will never be fuzzed and remain zero energy.
400 if (II
->Energy
> 0.0) {
401 II
->SumIncidence
+= 1;
402 II
->Energy
+= logl(II
->SumIncidence
) / II
->SumIncidence
;
406 DistributionNeedsUpdate
= true;
409 bool AddFeature(size_t Idx
, uint32_t NewSize
, bool Shrink
) {
411 Idx
= Idx
% kFeatureSetSize
;
412 uint32_t OldSize
= GetFeature(Idx
);
413 if (OldSize
== 0 || (Shrink
&& OldSize
> NewSize
)) {
415 size_t OldIdx
= SmallestElementPerFeature
[Idx
];
416 InputInfo
&II
= *Inputs
[OldIdx
];
417 assert(II
.NumFeatures
> 0);
419 if (II
.NumFeatures
== 0)
423 if (Entropic
.Enabled
)
424 AddRareFeature((uint32_t)Idx
);
426 NumUpdatedFeatures
++;
428 Printf("ADD FEATURE %zd sz %d\n", Idx
, NewSize
);
429 SmallestElementPerFeature
[Idx
] = Inputs
.size();
430 InputSizesPerFeature
[Idx
] = NewSize
;
436 // Increment frequency of feature Idx globally and locally.
437 void UpdateFeatureFrequency(InputInfo
*II
, size_t Idx
) {
438 uint32_t Idx32
= Idx
% kFeatureSetSize
;
440 // Saturated increment.
441 if (GlobalFeatureFreqs
[Idx32
] == 0xFFFF)
443 uint16_t Freq
= GlobalFeatureFreqs
[Idx32
]++;
446 if (Freq
> FreqOfMostAbundantRareFeature
||
447 std::find(RareFeatures
.begin(), RareFeatures
.end(), Idx32
) ==
451 // Update global frequencies.
452 if (Freq
== FreqOfMostAbundantRareFeature
)
453 FreqOfMostAbundantRareFeature
++;
455 // Update local frequencies.
457 II
->UpdateFeatureFrequency(Idx32
);
460 size_t NumFeatures() const { return NumAddedFeatures
; }
461 size_t NumFeatureUpdates() const { return NumUpdatedFeatures
; }
465 static const bool FeatureDebug
= false;
467 size_t GetFeature(size_t Idx
) const { return InputSizesPerFeature
[Idx
]; }
469 void ValidateFeatureSet() {
472 for (size_t Idx
= 0; Idx
< kFeatureSetSize
; Idx
++)
474 Inputs
[SmallestElementPerFeature
[Idx
]]->Tmp
++;
475 for (auto II
: Inputs
) {
476 if (II
->Tmp
!= II
->NumFeatures
)
477 Printf("ZZZ %zd %zd\n", II
->Tmp
, II
->NumFeatures
);
478 assert(II
->Tmp
== II
->NumFeatures
);
483 // Updates the probability distribution for the units in the corpus.
484 // Must be called whenever the corpus or unit weights are changed.
486 // Hypothesis: inputs that maximize information about globally rare features
488 void UpdateCorpusDistribution(Random
&Rand
) {
489 // Skip update if no seeds or rare features were added/deleted.
490 // Sparse updates for local change of feature frequencies,
491 // i.e., randomly do not skip.
492 if (!DistributionNeedsUpdate
&&
493 (!Entropic
.Enabled
|| Rand(kSparseEnergyUpdates
)))
496 DistributionNeedsUpdate
= false;
498 size_t N
= Inputs
.size();
500 Intervals
.resize(N
+ 1);
502 std::iota(Intervals
.begin(), Intervals
.end(), 0);
504 std::chrono::microseconds
AverageUnitExecutionTime(0);
505 for (auto II
: Inputs
) {
506 AverageUnitExecutionTime
+= II
->TimeOfUnit
;
508 AverageUnitExecutionTime
/= N
;
510 bool VanillaSchedule
= true;
511 if (Entropic
.Enabled
) {
512 for (auto II
: Inputs
) {
513 if (II
->NeedsEnergyUpdate
&& II
->Energy
!= 0.0) {
514 II
->NeedsEnergyUpdate
= false;
515 II
->UpdateEnergy(RareFeatures
.size(), Entropic
.ScalePerExecTime
,
516 AverageUnitExecutionTime
);
520 for (size_t i
= 0; i
< N
; i
++) {
522 if (Inputs
[i
]->NumFeatures
== 0) {
523 // If the seed doesn't represent any features, assign zero energy.
525 } else if (Inputs
[i
]->NumExecutedMutations
/ kMaxMutationFactor
>
526 NumExecutedMutations
/ Inputs
.size()) {
527 // If the seed was fuzzed a lot more than average, assign zero energy.
530 // Otherwise, simply assign the computed energy.
531 Weights
[i
] = Inputs
[i
]->Energy
;
534 // If energy for all seeds is zero, fall back to vanilla schedule.
535 if (Weights
[i
] > 0.0)
536 VanillaSchedule
= false;
540 if (VanillaSchedule
) {
541 for (size_t i
= 0; i
< N
; i
++)
542 Weights
[i
] = Inputs
[i
]->NumFeatures
543 ? (i
+ 1) * (Inputs
[i
]->HasFocusFunction
? 1000 : 1)
548 for (size_t i
= 0; i
< N
; i
++)
549 Printf("%zd ", Inputs
[i
]->NumFeatures
);
551 for (size_t i
= 0; i
< N
; i
++)
552 Printf("%f ", Weights
[i
]);
555 CorpusDistribution
= std::piecewise_constant_distribution
<double>(
556 Intervals
.begin(), Intervals
.end(), Weights
.begin());
558 std::piecewise_constant_distribution
<double> CorpusDistribution
;
560 Vector
<double> Intervals
;
561 Vector
<double> Weights
;
563 std::unordered_set
<std::string
> Hashes
;
564 Vector
<InputInfo
*> Inputs
;
566 size_t NumAddedFeatures
= 0;
567 size_t NumUpdatedFeatures
= 0;
568 uint32_t InputSizesPerFeature
[kFeatureSetSize
];
569 uint32_t SmallestElementPerFeature
[kFeatureSetSize
];
571 bool DistributionNeedsUpdate
= true;
572 uint16_t FreqOfMostAbundantRareFeature
= 0;
573 uint16_t GlobalFeatureFreqs
[kFeatureSetSize
] = {};
574 Vector
<uint32_t> RareFeatures
;
576 std::string OutputCorpus
;
579 } // namespace fuzzer
581 #endif // LLVM_FUZZER_CORPUS