1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "courgette/adjustment_method.h"
14 #include "base/basictypes.h"
15 #include "base/logging.h"
16 #include "base/string_number_conversions.h"
17 #include "base/stringprintf.h"
18 #include "courgette/assembly_program.h"
19 #include "courgette/courgette.h"
20 #include "courgette/encoded_program.h"
24 ////////////////////////////////////////////////////////////////////////////////
26 class NullAdjustmentMethod
: public AdjustmentMethod
{
27 bool Adjust(const AssemblyProgram
& model
, AssemblyProgram
* program
) {
32 ////////////////////////////////////////////////////////////////////////////////
34 // The purpose of adjustment is to assign indexes to Labels of a program 'p' to
35 // make the sequence of indexes similar to a 'model' program 'm'. Labels
36 // themselves don't have enough information to do this job, so we work with a
37 // LabelInfo surrogate for each label.
41 Label
* label_
; // The label that this info a surrogate for.
43 // Information used only in debugging messages.
44 uint32 is_model_
: 1; // Is the label in the model?
45 uint32 debug_index_
: 31; // An unique small number for naming the label.
47 uint32 refs_
; // Number of times this Label is referenced.
49 LabelInfo
* assignment_
; // Label from other program corresponding to this.
51 // LabelInfos are in a doubly linked list ordered by address (label_->rva_) so
52 // we can quickly find Labels adjacent in address order.
53 LabelInfo
* next_addr_
; // Label(Info) at next highest address.
54 LabelInfo
* prev_addr_
; // Label(Info) at next lowest address.
56 std::vector
<uint32
> positions_
; // Offsets into the trace of references.
58 // Just a no-argument constructor and copy constructor. Actual LabelInfo
59 // objects are allocated in std::pair structs in a std::map.
61 : label_(NULL
), is_model_(false), debug_index_(0), refs_(0),
67 void operator=(const LabelInfo
*); // Disallow assignment only.
69 // Public compiler generated copy constructor is needed to constuct
70 // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated
74 struct OrderLabelInfoByAddressAscending
{
75 bool operator()(const LabelInfo
* a
, const LabelInfo
* b
) const {
76 return a
->label_
->rva_
< b
->label_
->rva_
;
80 static std::string
ToString(LabelInfo
* info
) {
82 base::StringAppendF(&s
, "%c%d", "pm"[info
->is_model_
], info
->debug_index_
);
83 if (info
->label_
->index_
!= Label::kNoIndex
)
84 base::StringAppendF(&s
, " (%d)", info
->label_
->index_
);
86 base::StringAppendF(&s
, " #%u", info
->refs_
);
90 // General graph matching is exponential, essentially trying all permutations.
91 // The exponential algorithm can be made faster by avoiding consideration of
92 // impossible or unlikely matches. We can make the matching practical by eager
93 // matching - by looking for likely matches and commiting to them, and using the
94 // committed assignment as the basis for further matching.
96 // The basic eager graph-matching assignment is based on several ideas:
98 // * The strongest match will be for parts of the program that have not
99 // changed. If part of a program has not changed, then the number of
100 // references to a label will be the same, and corresponding pairs of
101 // adjacent labels will have the same RVA difference.
103 // * Some assignments are 'obvious' if you look at the distribution. Example:
104 // if both the program and the model have a label that is referred to much
105 // more often than the next most refered-to label, it is likely the two
106 // labels correspond.
108 // * If a label from the program corresponds to a label in the model, it is
109 // likely that the labels near the corresponding labels also match. A
110 // conservative way of extending the match is to assign only those labels
111 // which have exactly the same address offset and reference count.
113 // * If two labels correspond, then we can try to match up the references
114 // before and after the labels in the reference stream. For this to be
115 // practical, the number of references has to be small, e.g. each label has
116 // exactly one reference.
119 // Note: we also tried a completely different approach: random assignment
120 // followed by simulated annealing. This produced similar results. The results
121 // were not as good for very small differences because the simulated annealing
122 // never quite hit the groove. And simulated annealing was several orders of
126 // TRIE node for suffix strings in the label reference sequence.
128 // We dynamically build a trie for both the program and model, growing the trie
129 // as necessary. The trie node for a (possibly) empty string of label
130 // references contains the distribution of labels following the string. The
131 // roots node (for the empty string) thus contains the simple distribution of
132 // labels within the label reference stream.
135 Node(LabelInfo
* in_edge
, Node
* prev
)
136 : in_edge_(in_edge
), prev_(prev
), count_(0),
138 length_
= 1 + (prev_
? prev_
->length_
: 0);
140 LabelInfo
* in_edge_
; //
141 Node
* prev_
; // Node at shorter length.
142 int count_
; // Frequency of this path in Trie.
144 typedef std::map
<LabelInfo
*, Node
*> Edges
;
146 std::vector
<int> places_
; // Indexes into sequence of this item.
147 std::list
<Node
*> edges_in_frequency_order
;
150 bool Extended() const { return !edges_
.empty(); }
152 uint32
Weight() const {
153 return edges_in_frequency_order
.front()->count_
;
157 static std::string
ToString(Node
* node
) {
158 std::vector
<std::string
> prefix
;
159 for (Node
* n
= node
; n
->prev_
; n
= n
->prev_
)
160 prefix
.push_back(ToString(n
->in_edge_
));
164 const char* sep
= "";
165 while (!prefix
.empty()) {
172 s
+= base::StringPrintf("%u", node
->count_
);
174 s
+= base::Uint64ToString(node
->edges_in_frequency_order
.size());
179 typedef std::vector
<LabelInfo
*> Trace
;
181 struct OrderNodeByCountDecreasing
{
182 bool operator()(Node
* a
, Node
* b
) const {
183 if (a
->count_
!= b
->count_
)
184 return (a
->count_
) > (b
->count_
);
185 return a
->places_
.at(0) < b
->places_
.at(0); // Prefer first occuring.
189 struct OrderNodeByWeightDecreasing
{
190 bool operator()(Node
* a
, Node
* b
) const {
191 // (Maybe tie-break on total count, followed by lowest assigned node indexes
193 uint32 a_weight
= a
->Weight();
194 uint32 b_weight
= b
->Weight();
195 if (a_weight
!= b_weight
)
196 return a_weight
> b_weight
;
197 if (a
->length_
!= b
->length_
)
198 return a
->length_
> b
->length_
; // Prefer longer.
199 return a
->places_
.at(0) < b
->places_
.at(0); // Prefer first occuring.
203 typedef std::set
<Node
*, OrderNodeByWeightDecreasing
> NodeQueue
;
205 class AssignmentProblem
{
207 AssignmentProblem(const Trace
& model
,
208 const Trace
& problem
)
215 ~AssignmentProblem() {
216 for (size_t i
= 0; i
< all_nodes_
.size(); ++i
)
217 delete all_nodes_
[i
];
221 m_root_
= MakeRootNode(m_trace_
);
222 p_root_
= MakeRootNode(p_trace_
);
225 while (!worklist_
.empty()) {
226 Node
* node
= *worklist_
.begin();
227 node
->in_queue_
= false;
228 worklist_
.erase(node
);
232 VLOG(2) << unsolved_
.size() << " unsolved items";
237 void AddToQueue(Node
* node
) {
238 if (node
->length_
>= 10) {
239 VLOG(4) << "Length clipped " << ToString(node
->prev_
);
242 if (node
->in_queue_
) {
243 LOG(ERROR
) << "Double add " << ToString(node
);
246 // just to be sure data for prioritizing is available
247 ExtendNode(node
, p_trace_
);
248 // SkipCommittedLabels(node);
249 if (node
->edges_in_frequency_order
.empty())
251 node
->in_queue_
= true;
252 worklist_
.insert(node
);
255 void SkipCommittedLabels(Node
* node
) {
256 ExtendNode(node
, p_trace_
);
258 while (!node
->edges_in_frequency_order
.empty() &&
259 node
->edges_in_frequency_order
.front()->in_edge_
->assignment_
) {
261 node
->edges_in_frequency_order
.pop_front();
264 VLOG(4) << "Skipped " << skipped
<< " at " << ToString(node
);
267 void TrySolveNode(Node
* p_node
) {
268 Node
* front
= p_node
->edges_in_frequency_order
.front();
269 if (front
->in_edge_
->assignment_
) {
270 p_node
->edges_in_frequency_order
.pop_front();
276 // Compare frequencies of unassigned edges, and either make
277 // assignment(s) or move node to unsolved list
279 Node
* m_node
= FindModelNode(p_node
);
281 if (m_node
== NULL
) {
282 VLOG(2) << "Can't find model node";
283 unsolved_
.insert(p_node
);
286 ExtendNode(m_node
, m_trace_
);
288 // Lets just try greedy
290 SkipCommittedLabels(m_node
);
291 if (m_node
->edges_in_frequency_order
.empty()) {
292 VLOG(4) << "Punting, no elements left in model vs "
293 << p_node
->edges_in_frequency_order
.size();
294 unsolved_
.insert(p_node
);
297 Node
* m_match
= m_node
->edges_in_frequency_order
.front();
298 Node
* p_match
= p_node
->edges_in_frequency_order
.front();
300 if (p_match
->count_
> 1.1 * m_match
->count_
||
301 m_match
->count_
> 1.1 * p_match
->count_
) {
302 VLOG(3) << "Tricky distribution "
303 << p_match
->count_
<< ":" << m_match
->count_
<< " "
304 << ToString(p_match
) << " vs " << ToString(m_match
);
308 m_node
->edges_in_frequency_order
.pop_front();
309 p_node
->edges_in_frequency_order
.pop_front();
311 LabelInfo
* p_label_info
= p_match
->in_edge_
;
312 LabelInfo
* m_label_info
= m_match
->in_edge_
;
313 int m_index
= p_label_info
->label_
->index_
;
314 if (m_index
!= Label::kNoIndex
) {
315 VLOG(2) << "Cant use unassigned label from model " << m_index
;
316 unsolved_
.insert(p_node
);
320 Assign(p_label_info
, m_label_info
);
322 AddToQueue(p_match
); // find matches within new match
323 AddToQueue(p_node
); // and more matches within this node
326 void Assign(LabelInfo
* p_info
, LabelInfo
* m_info
) {
327 AssignOne(p_info
, m_info
);
328 VLOG(4) << "Assign " << ToString(p_info
) << " := " << ToString(m_info
);
329 // Now consider unassigned adjacent addresses
330 TryExtendAssignment(p_info
, m_info
);
333 void AssignOne(LabelInfo
* p_info
, LabelInfo
* m_info
) {
334 p_info
->label_
->index_
= m_info
->label_
->index_
;
337 m_info
->assignment_
= p_info
;
338 p_info
->assignment_
= m_info
;
341 void TryExtendAssignment(LabelInfo
* p_info
, LabelInfo
* m_info
) {
342 RVA m_rva_base
= m_info
->label_
->rva_
;
343 RVA p_rva_base
= p_info
->label_
->rva_
;
345 LabelInfo
* m_info_next
= m_info
->next_addr_
;
346 LabelInfo
* p_info_next
= p_info
->next_addr_
;
347 for ( ; m_info_next
&& p_info_next
; ) {
348 if (m_info_next
->assignment_
)
351 RVA m_rva
= m_info_next
->label_
->rva_
;
352 RVA p_rva
= p_info_next
->label_
->rva_
;
354 if (m_rva
- m_rva_base
!= p_rva
- p_rva_base
) {
355 // previous label was pointing to something that is different size
358 LabelInfo
* m_info_next_next
= m_info_next
->next_addr_
;
359 LabelInfo
* p_info_next_next
= p_info_next
->next_addr_
;
360 if (m_info_next_next
&& p_info_next_next
) {
361 RVA m_rva_next
= m_info_next_next
->label_
->rva_
;
362 RVA p_rva_next
= p_info_next_next
->label_
->rva_
;
363 if (m_rva_next
- m_rva
!= p_rva_next
- p_rva
) {
364 // Since following labels are no longer in address lockstep, assume
365 // this address has a difference.
370 // The label has inconsistent numbers of references, it is probably not
372 if (m_info_next
->refs_
!= p_info_next
->refs_
) {
376 VLOG(4) << " Extending assignment -> "
377 << ToString(p_info_next
) << " := " << ToString(m_info_next
);
379 AssignOne(p_info_next
, m_info_next
);
381 if (p_info_next
->refs_
== m_info_next
->refs_
&&
382 p_info_next
->refs_
== 1) {
383 TryExtendSequence(p_info_next
->positions_
[0],
384 m_info_next
->positions_
[0]);
385 TryExtendSequenceBackwards(p_info_next
->positions_
[0],
386 m_info_next
->positions_
[0]);
389 p_info_next
= p_info_next_next
;
390 m_info_next
= m_info_next_next
;
393 LabelInfo
* m_info_prev
= m_info
->prev_addr_
;
394 LabelInfo
* p_info_prev
= p_info
->prev_addr_
;
395 for ( ; m_info_prev
&& p_info_prev
; ) {
396 if (m_info_prev
->assignment_
)
399 RVA m_rva
= m_info_prev
->label_
->rva_
;
400 RVA p_rva
= p_info_prev
->label_
->rva_
;
402 if (m_rva
- m_rva_base
!= p_rva
- p_rva_base
) {
403 // previous label was pointing to something that is different size
406 LabelInfo
* m_info_prev_prev
= m_info_prev
->prev_addr_
;
407 LabelInfo
* p_info_prev_prev
= p_info_prev
->prev_addr_
;
409 // The the label has inconsistent numbers of references, it is
410 // probably not the same thing
411 if (m_info_prev
->refs_
!= p_info_prev
->refs_
) {
415 AssignOne(p_info_prev
, m_info_prev
);
416 VLOG(4) << " Extending assignment <- " << ToString(p_info_prev
) << " := "
417 << ToString(m_info_prev
);
419 p_info_prev
= p_info_prev_prev
;
420 m_info_prev
= m_info_prev_prev
;
424 uint32
TryExtendSequence(uint32 p_pos_start
, uint32 m_pos_start
) {
425 uint32 p_pos
= p_pos_start
+ 1;
426 uint32 m_pos
= m_pos_start
+ 1;
428 while (p_pos
< p_trace_
.size() && m_pos
< m_trace_
.size()) {
429 LabelInfo
* p_info
= p_trace_
[p_pos
];
430 LabelInfo
* m_info
= m_trace_
[m_pos
];
432 // To match, either (1) both are assigned or (2) both are unassigned.
433 if ((p_info
->assignment_
== NULL
) != (m_info
->assignment_
== NULL
))
436 // If they are assigned, it needs to be consistent (same index).
437 if (p_info
->assignment_
&& m_info
->assignment_
) {
438 if (p_info
->label_
->index_
!= m_info
->label_
->index_
)
445 if (p_info
->refs_
!= m_info
->refs_
)
448 AssignOne(p_info
, m_info
);
449 VLOG(4) << " Extending assignment seq[+" << p_pos
- p_pos_start
450 << "] -> " << ToString(p_info
) << " := " << ToString(m_info
);
456 return p_pos
- p_pos_start
;
459 uint32
TryExtendSequenceBackwards(uint32 p_pos_start
, uint32 m_pos_start
) {
460 if (p_pos_start
== 0 || m_pos_start
== 0)
463 uint32 p_pos
= p_pos_start
- 1;
464 uint32 m_pos
= m_pos_start
- 1;
466 while (p_pos
> 0 && m_pos
> 0) {
467 LabelInfo
* p_info
= p_trace_
[p_pos
];
468 LabelInfo
* m_info
= m_trace_
[m_pos
];
470 if ((p_info
->assignment_
== NULL
) != (m_info
->assignment_
== NULL
))
473 if (p_info
->assignment_
&& m_info
->assignment_
) {
474 if (p_info
->label_
->index_
!= m_info
->label_
->index_
)
481 if (p_info
->refs_
!= m_info
->refs_
)
484 AssignOne(p_info
, m_info
);
485 VLOG(4) << " Extending assignment seq[-" << p_pos_start
- p_pos
486 << "] <- " << ToString(p_info
) << " := " << ToString(m_info
);
492 return p_pos
- p_pos_start
;
495 Node
* FindModelNode(Node
* node
) {
496 if (node
->prev_
== NULL
)
499 Node
* m_parent
= FindModelNode(node
->prev_
);
500 if (m_parent
== NULL
) {
504 ExtendNode(m_parent
, m_trace_
);
506 LabelInfo
* p_label
= node
->in_edge_
;
507 LabelInfo
* m_label
= p_label
->assignment_
;
508 if (m_label
== NULL
) {
509 VLOG(2) << "Expected assigned prefix";
513 Node::Edges::iterator e
= m_parent
->edges_
.find(m_label
);
514 if (e
== m_parent
->edges_
.end()) {
515 VLOG(3) << "Expected defined edge in parent";
522 Node
* MakeRootNode(const Trace
& trace
) {
523 Node
* node
= new Node(NULL
, NULL
);
524 all_nodes_
.push_back(node
);
525 for (uint32 i
= 0; i
< trace
.size(); ++i
) {
527 node
->places_
.push_back(i
);
532 void ExtendNode(Node
* node
, const Trace
& trace
) {
533 // Make sure trie is filled in at this node.
534 if (node
->Extended())
536 for (size_t i
= 0; i
< node
->places_
.size(); ++i
) {
537 uint32 index
= node
->places_
.at(i
);
538 if (index
< trace
.size()) {
539 LabelInfo
* item
= trace
.at(index
);
540 Node
*& slot
= node
->edges_
[item
];
542 slot
= new Node(item
, node
);
543 all_nodes_
.push_back(slot
);
544 node
->edges_in_frequency_order
.push_back(slot
);
546 slot
->places_
.push_back(index
+ 1);
550 node
->edges_in_frequency_order
.sort(OrderNodeByCountDecreasing());
553 const Trace
& m_trace_
;
554 const Trace
& p_trace_
;
561 std::vector
<Node
*> all_nodes_
;
563 DISALLOW_COPY_AND_ASSIGN(AssignmentProblem
);
566 class GraphAdjuster
: public AdjustmentMethod
{
571 debug_label_index_gen_(0) {}
574 bool Adjust(const AssemblyProgram
& model
, AssemblyProgram
* program
) {
575 VLOG(1) << "GraphAdjuster::Adjust";
578 debug_label_index_gen_
= 0;
583 prog_
->UnassignIndexes();
584 CollectTraces(model_
, &model_abs32_
, &model_rel32_
, true);
585 CollectTraces(prog_
, &prog_abs32_
, &prog_rel32_
, false);
586 Solve(model_abs32_
, prog_abs32_
);
587 Solve(model_rel32_
, prog_rel32_
);
588 prog_
->AssignRemainingIndexes();
594 void CollectTraces(const AssemblyProgram
* program
, Trace
* abs32
, Trace
* rel32
,
596 const InstructionVector
& instructions
= program
->instructions();
597 for (size_t i
= 0; i
< instructions
.size(); ++i
) {
598 Instruction
* instruction
= instructions
[i
];
599 if (Label
* label
= program
->InstructionAbs32Label(instruction
))
600 ReferenceLabel(abs32
, label
, is_model
);
601 if (Label
* label
= program
->InstructionRel32Label(instruction
))
602 ReferenceLabel(rel32
, label
, is_model
);
604 // TODO(sra): we could simply append all the labels in index order to
605 // incorporate some costing for entropy (bigger deltas) that will be
606 // introduced into the label address table by non-monotonic ordering. This
607 // would have some knock-on effects to parts of the algorithm that work on
608 // single-occurrence labels.
611 void Solve(const Trace
& model
, const Trace
& problem
) {
612 LinkLabelInfos(model
);
613 LinkLabelInfos(problem
);
614 AssignmentProblem
a(model
, problem
);
618 void LinkLabelInfos(const Trace
& trace
) {
619 typedef std::set
<LabelInfo
*, OrderLabelInfoByAddressAscending
> Ordered
;
621 for (Trace::const_iterator p
= trace
.begin(); p
!= trace
.end(); ++p
)
623 LabelInfo
* prev
= NULL
;
624 for (Ordered::iterator p
= ordered
.begin(); p
!= ordered
.end(); ++p
) {
625 LabelInfo
* curr
= *p
;
626 if (prev
) prev
->next_addr_
= curr
;
627 curr
->prev_addr_
= prev
;
630 if (curr
->positions_
.size() != curr
->refs_
)
635 void ReferenceLabel(Trace
* trace
, Label
* label
, bool is_model
) {
636 trace
->push_back(MakeLabelInfo(label
, is_model
,
637 static_cast<uint32
>(trace
->size())));
640 LabelInfo
* MakeLabelInfo(Label
* label
, bool is_model
, uint32 position
) {
641 LabelInfo
& slot
= label_infos_
[label
];
642 if (slot
.label_
== NULL
) {
644 slot
.is_model_
= is_model
;
645 slot
.debug_index_
= ++debug_label_index_gen_
;
647 slot
.positions_
.push_back(position
);
652 AssemblyProgram
* prog_
; // Program to be adjusted, owned by caller.
653 const AssemblyProgram
* model_
; // Program to be mimicked, owned by caller.
660 int debug_label_index_gen_
;
662 // Note LabelInfo is allocated inside map, so the LabelInfo lifetimes are
663 // managed by the map.
664 std::map
<Label
*, LabelInfo
> label_infos_
;
667 DISALLOW_COPY_AND_ASSIGN(GraphAdjuster
);
671 ////////////////////////////////////////////////////////////////////////////////
673 void AdjustmentMethod::Destroy() { delete this; }
675 AdjustmentMethod
* AdjustmentMethod::MakeNullAdjustmentMethod() {
676 return new NullAdjustmentMethod();
679 AdjustmentMethod
* AdjustmentMethod::MakeTrieAdjustmentMethod() {
680 return new GraphAdjuster();
683 Status
Adjust(const AssemblyProgram
& model
, AssemblyProgram
* program
) {
684 AdjustmentMethod
* method
= AdjustmentMethod::MakeProductionAdjustmentMethod();
685 bool ok
= method
->Adjust(model
, program
);
690 return C_ADJUSTMENT_FAILED
;
693 } // namespace courgette