Include all dupe types (event when value is zero) in scan stats.
[chromium-blink-merge.git] / courgette / adjustment_method.cc
blob2bc926903721b15ba88266f60d72e69b6240e814
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "courgette/adjustment_method.h"
7 #include <algorithm>
8 #include <list>
9 #include <map>
10 #include <set>
11 #include <string>
12 #include <vector>
14 #include "base/basictypes.h"
15 #include "base/logging.h"
16 #include "base/strings/string_number_conversions.h"
17 #include "base/strings/stringprintf.h"
18 #include "courgette/assembly_program.h"
19 #include "courgette/courgette.h"
20 #include "courgette/encoded_program.h"
22 namespace courgette {
24 ////////////////////////////////////////////////////////////////////////////////
26 class NullAdjustmentMethod : public AdjustmentMethod {
27 bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
28 return true;
32 ////////////////////////////////////////////////////////////////////////////////
34 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to
35 // make the sequence of indexes similar to a 'model' program 'm'. Labels
36 // themselves don't have enough information to do this job, so we work with a
37 // LabelInfo surrogate for each label.
39 class LabelInfo {
40 public:
41 Label* label_; // The label that this info a surrogate for.
43 // Information used only in debugging messages.
44 uint32 is_model_ : 1; // Is the label in the model?
45 uint32 debug_index_ : 31; // An unique small number for naming the label.
47 uint32 refs_; // Number of times this Label is referenced.
49 LabelInfo* assignment_; // Label from other program corresponding to this.
51 // LabelInfos are in a doubly linked list ordered by address (label_->rva_) so
52 // we can quickly find Labels adjacent in address order.
53 LabelInfo* next_addr_; // Label(Info) at next highest address.
54 LabelInfo* prev_addr_; // Label(Info) at next lowest address.
56 std::vector<uint32> positions_; // Offsets into the trace of references.
58 // Just a no-argument constructor and copy constructor. Actual LabelInfo
59 // objects are allocated in std::pair structs in a std::map.
60 LabelInfo()
61 : label_(NULL), is_model_(false), debug_index_(0), refs_(0),
62 assignment_(NULL),
63 next_addr_(NULL),
64 prev_addr_(NULL) {}
66 private:
67 void operator=(const LabelInfo*); // Disallow assignment only.
69 // Public compiler generated copy constructor is needed to constuct
70 // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
71 // inside a std::map.
74 struct OrderLabelInfoByAddressAscending {
75 bool operator()(const LabelInfo* a, const LabelInfo* b) const {
76 return a->label_->rva_ < b->label_->rva_;
80 static std::string ToString(LabelInfo* info) {
81 std::string s;
82 base::StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_);
83 if (info->label_->index_ != Label::kNoIndex)
84 base::StringAppendF(&s, " (%d)", info->label_->index_);
86 base::StringAppendF(&s, " #%u", info->refs_);
87 return s;
90 // General graph matching is exponential, essentially trying all permutations.
91 // The exponential algorithm can be made faster by avoiding consideration of
92 // impossible or unlikely matches. We can make the matching practical by eager
93 // matching - by looking for likely matches and commiting to them, and using the
94 // committed assignment as the basis for further matching.
96 // The basic eager graph-matching assignment is based on several ideas:
98 // * The strongest match will be for parts of the program that have not
99 // changed. If part of a program has not changed, then the number of
100 // references to a label will be the same, and corresponding pairs of
101 // adjacent labels will have the same RVA difference.
103 // * Some assignments are 'obvious' if you look at the distribution. Example:
104 // if both the program and the model have a label that is referred to much
105 // more often than the next most refered-to label, it is likely the two
106 // labels correspond.
108 // * If a label from the program corresponds to a label in the model, it is
109 // likely that the labels near the corresponding labels also match. A
110 // conservative way of extending the match is to assign only those labels
111 // which have exactly the same address offset and reference count.
113 // * If two labels correspond, then we can try to match up the references
114 // before and after the labels in the reference stream. For this to be
115 // practical, the number of references has to be small, e.g. each label has
116 // exactly one reference.
119 // Note: we also tried a completely different approach: random assignment
120 // followed by simulated annealing. This produced similar results. The results
121 // were not as good for very small differences because the simulated annealing
122 // never quite hit the groove. And simulated annealing was several orders of
123 // magnitude slower.
126 // TRIE node for suffix strings in the label reference sequence.
128 // We dynamically build a trie for both the program and model, growing the trie
129 // as necessary. The trie node for a (possibly) empty string of label
130 // references contains the distribution of labels following the string. The
131 // roots node (for the empty string) thus contains the simple distribution of
132 // labels within the label reference stream.
134 struct Node {
135 Node(LabelInfo* in_edge, Node* prev)
136 : in_edge_(in_edge), prev_(prev), count_(0),
137 in_queue_(false) {
138 length_ = 1 + (prev_ ? prev_->length_ : 0);
140 LabelInfo* in_edge_; //
141 Node* prev_; // Node at shorter length.
142 int count_; // Frequency of this path in Trie.
143 int length_;
144 typedef std::map<LabelInfo*, Node*> Edges;
145 Edges edges_;
146 std::vector<int> places_; // Indexes into sequence of this item.
147 std::list<Node*> edges_in_frequency_order;
149 bool in_queue_;
150 bool Extended() const { return !edges_.empty(); }
152 uint32 Weight() const {
153 return edges_in_frequency_order.front()->count_;
157 static std::string ToString(Node* node) {
158 std::vector<std::string> prefix;
159 for (Node* n = node; n->prev_; n = n->prev_)
160 prefix.push_back(ToString(n->in_edge_));
162 std::string s;
163 s += "{";
164 const char* sep = "";
165 while (!prefix.empty()) {
166 s += sep;
167 sep = ",";
168 s += prefix.back();
169 prefix.pop_back();
172 s += base::StringPrintf("%u", node->count_);
173 s += " @";
174 s += base::Uint64ToString(node->edges_in_frequency_order.size());
175 s += "}";
176 return s;
179 typedef std::vector<LabelInfo*> Trace;
181 struct OrderNodeByCountDecreasing {
182 bool operator()(Node* a, Node* b) const {
183 if (a->count_ != b->count_)
184 return (a->count_) > (b->count_);
185 return a->places_.at(0) < b->places_.at(0); // Prefer first occuring.
189 struct OrderNodeByWeightDecreasing {
190 bool operator()(Node* a, Node* b) const {
191 // (Maybe tie-break on total count, followed by lowest assigned node indexes
192 // in path.)
193 uint32 a_weight = a->Weight();
194 uint32 b_weight = b->Weight();
195 if (a_weight != b_weight)
196 return a_weight > b_weight;
197 if (a->length_ != b->length_)
198 return a->length_ > b->length_; // Prefer longer.
199 return a->places_.at(0) < b->places_.at(0); // Prefer first occuring.
203 typedef std::set<Node*, OrderNodeByWeightDecreasing> NodeQueue;
205 class AssignmentProblem {
206 public:
207 AssignmentProblem(const Trace& model,
208 const Trace& problem)
209 : m_trace_(model),
210 p_trace_(problem),
211 m_root_(NULL),
212 p_root_(NULL) {
215 ~AssignmentProblem() {
216 for (size_t i = 0; i < all_nodes_.size(); ++i)
217 delete all_nodes_[i];
220 bool Solve() {
221 m_root_ = MakeRootNode(m_trace_);
222 p_root_ = MakeRootNode(p_trace_);
223 AddToQueue(p_root_);
225 while (!worklist_.empty()) {
226 Node* node = *worklist_.begin();
227 node->in_queue_ = false;
228 worklist_.erase(node);
229 TrySolveNode(node);
232 VLOG(2) << unsolved_.size() << " unsolved items";
233 return true;
236 private:
237 void AddToQueue(Node* node) {
238 if (node->length_ >= 10) {
239 VLOG(4) << "Length clipped " << ToString(node->prev_);
240 return;
242 if (node->in_queue_) {
243 LOG(ERROR) << "Double add " << ToString(node);
244 return;
246 // just to be sure data for prioritizing is available
247 ExtendNode(node, p_trace_);
248 // SkipCommittedLabels(node);
249 if (node->edges_in_frequency_order.empty())
250 return;
251 node->in_queue_ = true;
252 worklist_.insert(node);
255 void SkipCommittedLabels(Node* node) {
256 ExtendNode(node, p_trace_);
257 uint32 skipped = 0;
258 while (!node->edges_in_frequency_order.empty() &&
259 node->edges_in_frequency_order.front()->in_edge_->assignment_) {
260 ++skipped;
261 node->edges_in_frequency_order.pop_front();
263 if (skipped > 0)
264 VLOG(4) << "Skipped " << skipped << " at " << ToString(node);
267 void TrySolveNode(Node* p_node) {
268 Node* front = p_node->edges_in_frequency_order.front();
269 if (front->in_edge_->assignment_) {
270 p_node->edges_in_frequency_order.pop_front();
271 AddToQueue(front);
272 AddToQueue(p_node);
273 return;
276 // Compare frequencies of unassigned edges, and either make
277 // assignment(s) or move node to unsolved list
279 Node* m_node = FindModelNode(p_node);
281 if (m_node == NULL) {
282 VLOG(2) << "Can't find model node";
283 unsolved_.insert(p_node);
284 return;
286 ExtendNode(m_node, m_trace_);
288 // Lets just try greedy
290 SkipCommittedLabels(m_node);
291 if (m_node->edges_in_frequency_order.empty()) {
292 VLOG(4) << "Punting, no elements left in model vs "
293 << p_node->edges_in_frequency_order.size();
294 unsolved_.insert(p_node);
295 return;
297 Node* m_match = m_node->edges_in_frequency_order.front();
298 Node* p_match = p_node->edges_in_frequency_order.front();
300 if (p_match->count_ > 1.1 * m_match->count_ ||
301 m_match->count_ > 1.1 * p_match->count_) {
302 VLOG(3) << "Tricky distribution "
303 << p_match->count_ << ":" << m_match->count_ << " "
304 << ToString(p_match) << " vs " << ToString(m_match);
305 return;
308 m_node->edges_in_frequency_order.pop_front();
309 p_node->edges_in_frequency_order.pop_front();
311 LabelInfo* p_label_info = p_match->in_edge_;
312 LabelInfo* m_label_info = m_match->in_edge_;
313 int m_index = p_label_info->label_->index_;
314 if (m_index != Label::kNoIndex) {
315 VLOG(2) << "Cant use unassigned label from model " << m_index;
316 unsolved_.insert(p_node);
317 return;
320 Assign(p_label_info, m_label_info);
322 AddToQueue(p_match); // find matches within new match
323 AddToQueue(p_node); // and more matches within this node
326 void Assign(LabelInfo* p_info, LabelInfo* m_info) {
327 AssignOne(p_info, m_info);
328 VLOG(4) << "Assign " << ToString(p_info) << " := " << ToString(m_info);
329 // Now consider unassigned adjacent addresses
330 TryExtendAssignment(p_info, m_info);
333 void AssignOne(LabelInfo* p_info, LabelInfo* m_info) {
334 p_info->label_->index_ = m_info->label_->index_;
336 // Mark as assigned
337 m_info->assignment_ = p_info;
338 p_info->assignment_ = m_info;
341 void TryExtendAssignment(LabelInfo* p_info, LabelInfo* m_info) {
342 RVA m_rva_base = m_info->label_->rva_;
343 RVA p_rva_base = p_info->label_->rva_;
345 LabelInfo* m_info_next = m_info->next_addr_;
346 LabelInfo* p_info_next = p_info->next_addr_;
347 for ( ; m_info_next && p_info_next; ) {
348 if (m_info_next->assignment_)
349 break;
351 RVA m_rva = m_info_next->label_->rva_;
352 RVA p_rva = p_info_next->label_->rva_;
354 if (m_rva - m_rva_base != p_rva - p_rva_base) {
355 // previous label was pointing to something that is different size
356 break;
358 LabelInfo* m_info_next_next = m_info_next->next_addr_;
359 LabelInfo* p_info_next_next = p_info_next->next_addr_;
360 if (m_info_next_next && p_info_next_next) {
361 RVA m_rva_next = m_info_next_next->label_->rva_;
362 RVA p_rva_next = p_info_next_next->label_->rva_;
363 if (m_rva_next - m_rva != p_rva_next - p_rva) {
364 // Since following labels are no longer in address lockstep, assume
365 // this address has a difference.
366 break;
370 // The label has inconsistent numbers of references, it is probably not
371 // the same thing.
372 if (m_info_next->refs_ != p_info_next->refs_) {
373 break;
376 VLOG(4) << " Extending assignment -> "
377 << ToString(p_info_next) << " := " << ToString(m_info_next);
379 AssignOne(p_info_next, m_info_next);
381 if (p_info_next->refs_ == m_info_next->refs_ &&
382 p_info_next->refs_ == 1) {
383 TryExtendSequence(p_info_next->positions_[0],
384 m_info_next->positions_[0]);
385 TryExtendSequenceBackwards(p_info_next->positions_[0],
386 m_info_next->positions_[0]);
389 p_info_next = p_info_next_next;
390 m_info_next = m_info_next_next;
393 LabelInfo* m_info_prev = m_info->prev_addr_;
394 LabelInfo* p_info_prev = p_info->prev_addr_;
395 for ( ; m_info_prev && p_info_prev; ) {
396 if (m_info_prev->assignment_)
397 break;
399 RVA m_rva = m_info_prev->label_->rva_;
400 RVA p_rva = p_info_prev->label_->rva_;
402 if (m_rva - m_rva_base != p_rva - p_rva_base) {
403 // previous label was pointing to something that is different size
404 break;
406 LabelInfo* m_info_prev_prev = m_info_prev->prev_addr_;
407 LabelInfo* p_info_prev_prev = p_info_prev->prev_addr_;
409 // The the label has inconsistent numbers of references, it is
410 // probably not the same thing
411 if (m_info_prev->refs_ != p_info_prev->refs_) {
412 break;
415 AssignOne(p_info_prev, m_info_prev);
416 VLOG(4) << " Extending assignment <- " << ToString(p_info_prev) << " := "
417 << ToString(m_info_prev);
419 p_info_prev = p_info_prev_prev;
420 m_info_prev = m_info_prev_prev;
424 uint32 TryExtendSequence(uint32 p_pos_start, uint32 m_pos_start) {
425 uint32 p_pos = p_pos_start + 1;
426 uint32 m_pos = m_pos_start + 1;
428 while (p_pos < p_trace_.size() && m_pos < m_trace_.size()) {
429 LabelInfo* p_info = p_trace_[p_pos];
430 LabelInfo* m_info = m_trace_[m_pos];
432 // To match, either (1) both are assigned or (2) both are unassigned.
433 if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL))
434 break;
436 // If they are assigned, it needs to be consistent (same index).
437 if (p_info->assignment_ && m_info->assignment_) {
438 if (p_info->label_->index_ != m_info->label_->index_)
439 break;
440 ++p_pos;
441 ++m_pos;
442 continue;
445 if (p_info->refs_ != m_info->refs_)
446 break;
448 AssignOne(p_info, m_info);
449 VLOG(4) << " Extending assignment seq[+" << p_pos - p_pos_start
450 << "] -> " << ToString(p_info) << " := " << ToString(m_info);
452 ++p_pos;
453 ++m_pos;
456 return p_pos - p_pos_start;
459 uint32 TryExtendSequenceBackwards(uint32 p_pos_start, uint32 m_pos_start) {
460 if (p_pos_start == 0 || m_pos_start == 0)
461 return 0;
463 uint32 p_pos = p_pos_start - 1;
464 uint32 m_pos = m_pos_start - 1;
466 while (p_pos > 0 && m_pos > 0) {
467 LabelInfo* p_info = p_trace_[p_pos];
468 LabelInfo* m_info = m_trace_[m_pos];
470 if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL))
471 break;
473 if (p_info->assignment_ && m_info->assignment_) {
474 if (p_info->label_->index_ != m_info->label_->index_)
475 break;
476 --p_pos;
477 --m_pos;
478 continue;
481 if (p_info->refs_ != m_info->refs_)
482 break;
484 AssignOne(p_info, m_info);
485 VLOG(4) << " Extending assignment seq[-" << p_pos_start - p_pos
486 << "] <- " << ToString(p_info) << " := " << ToString(m_info);
488 --p_pos;
489 --m_pos;
492 return p_pos - p_pos_start;
495 Node* FindModelNode(Node* node) {
496 if (node->prev_ == NULL)
497 return m_root_;
499 Node* m_parent = FindModelNode(node->prev_);
500 if (m_parent == NULL) {
501 return NULL;
504 ExtendNode(m_parent, m_trace_);
506 LabelInfo* p_label = node->in_edge_;
507 LabelInfo* m_label = p_label->assignment_;
508 if (m_label == NULL) {
509 VLOG(2) << "Expected assigned prefix";
510 return NULL;
513 Node::Edges::iterator e = m_parent->edges_.find(m_label);
514 if (e == m_parent->edges_.end()) {
515 VLOG(3) << "Expected defined edge in parent";
516 return NULL;
519 return e->second;
522 Node* MakeRootNode(const Trace& trace) {
523 Node* node = new Node(NULL, NULL);
524 all_nodes_.push_back(node);
525 for (uint32 i = 0; i < trace.size(); ++i) {
526 ++node->count_;
527 node->places_.push_back(i);
529 return node;
532 void ExtendNode(Node* node, const Trace& trace) {
533 // Make sure trie is filled in at this node.
534 if (node->Extended())
535 return;
536 for (size_t i = 0; i < node->places_.size(); ++i) {
537 uint32 index = node->places_.at(i);
538 if (index < trace.size()) {
539 LabelInfo* item = trace.at(index);
540 Node*& slot = node->edges_[item];
541 if (slot == NULL) {
542 slot = new Node(item, node);
543 all_nodes_.push_back(slot);
544 node->edges_in_frequency_order.push_back(slot);
546 slot->places_.push_back(index + 1);
547 ++slot->count_;
550 node->edges_in_frequency_order.sort(OrderNodeByCountDecreasing());
553 const Trace& m_trace_;
554 const Trace& p_trace_;
555 Node* m_root_;
556 Node* p_root_;
558 NodeQueue worklist_;
559 NodeQueue unsolved_;
561 std::vector<Node*> all_nodes_;
563 DISALLOW_COPY_AND_ASSIGN(AssignmentProblem);
566 class GraphAdjuster : public AdjustmentMethod {
567 public:
568 GraphAdjuster()
569 : prog_(NULL),
570 model_(NULL),
571 debug_label_index_gen_(0) {}
572 ~GraphAdjuster() {}
574 bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
575 VLOG(1) << "GraphAdjuster::Adjust";
576 prog_ = program;
577 model_ = &model;
578 debug_label_index_gen_ = 0;
579 return Finish();
582 bool Finish() {
583 prog_->UnassignIndexes();
584 CollectTraces(model_, &model_abs32_, &model_rel32_, true);
585 CollectTraces(prog_, &prog_abs32_, &prog_rel32_, false);
586 Solve(model_abs32_, prog_abs32_);
587 Solve(model_rel32_, prog_rel32_);
588 prog_->AssignRemainingIndexes();
589 return true;
592 private:
594 void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32,
595 bool is_model) {
596 const InstructionVector& instructions = program->instructions();
597 for (size_t i = 0; i < instructions.size(); ++i) {
598 Instruction* instruction = instructions[i];
599 if (Label* label = program->InstructionAbs32Label(instruction))
600 ReferenceLabel(abs32, label, is_model);
601 if (Label* label = program->InstructionRel32Label(instruction))
602 ReferenceLabel(rel32, label, is_model);
604 // TODO(sra): we could simply append all the labels in index order to
605 // incorporate some costing for entropy (bigger deltas) that will be
606 // introduced into the label address table by non-monotonic ordering. This
607 // would have some knock-on effects to parts of the algorithm that work on
608 // single-occurrence labels.
611 void Solve(const Trace& model, const Trace& problem) {
612 LinkLabelInfos(model);
613 LinkLabelInfos(problem);
614 AssignmentProblem a(model, problem);
615 a.Solve();
618 void LinkLabelInfos(const Trace& trace) {
619 typedef std::set<LabelInfo*, OrderLabelInfoByAddressAscending> Ordered;
620 Ordered ordered;
621 for (Trace::const_iterator p = trace.begin(); p != trace.end(); ++p)
622 ordered.insert(*p);
623 LabelInfo* prev = NULL;
624 for (Ordered::iterator p = ordered.begin(); p != ordered.end(); ++p) {
625 LabelInfo* curr = *p;
626 if (prev) prev->next_addr_ = curr;
627 curr->prev_addr_ = prev;
628 prev = curr;
630 if (curr->positions_.size() != curr->refs_)
631 NOTREACHED();
635 void ReferenceLabel(Trace* trace, Label* label, bool is_model) {
636 trace->push_back(MakeLabelInfo(label, is_model,
637 static_cast<uint32>(trace->size())));
640 LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) {
641 LabelInfo& slot = label_infos_[label];
642 if (slot.label_ == NULL) {
643 slot.label_ = label;
644 slot.is_model_ = is_model;
645 slot.debug_index_ = ++debug_label_index_gen_;
647 slot.positions_.push_back(position);
648 ++slot.refs_;
649 return &slot;
652 AssemblyProgram* prog_; // Program to be adjusted, owned by caller.
653 const AssemblyProgram* model_; // Program to be mimicked, owned by caller.
655 Trace model_abs32_;
656 Trace model_rel32_;
657 Trace prog_abs32_;
658 Trace prog_rel32_;
660 int debug_label_index_gen_;
662 // Note LabelInfo is allocated inside map, so the LabelInfo lifetimes are
663 // managed by the map.
664 std::map<Label*, LabelInfo> label_infos_;
666 private:
667 DISALLOW_COPY_AND_ASSIGN(GraphAdjuster);
671 ////////////////////////////////////////////////////////////////////////////////
673 void AdjustmentMethod::Destroy() { delete this; }
675 AdjustmentMethod* AdjustmentMethod::MakeNullAdjustmentMethod() {
676 return new NullAdjustmentMethod();
679 AdjustmentMethod* AdjustmentMethod::MakeTrieAdjustmentMethod() {
680 return new GraphAdjuster();
683 Status Adjust(const AssemblyProgram& model, AssemblyProgram* program) {
684 AdjustmentMethod* method = AdjustmentMethod::MakeProductionAdjustmentMethod();
685 bool ok = method->Adjust(model, program);
686 method->Destroy();
687 if (ok)
688 return C_OK;
689 else
690 return C_ADJUSTMENT_FAILED;
693 } // namespace courgette