1 //===- bolt/Rewrite/LinuxKernelRewriter.cpp -------------------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // Support for updating Linux Kernel metadata.
11 //===----------------------------------------------------------------------===//
13 #include "bolt/Core/BinaryFunction.h"
14 #include "bolt/Rewrite/MetadataRewriter.h"
15 #include "bolt/Rewrite/MetadataRewriters.h"
16 #include "bolt/Utils/CommandLineOpts.h"
17 #include "llvm/Support/CommandLine.h"
18 #include "llvm/Support/Errc.h"
27 cl::desc("print ORC unwind information for instructions"),
28 cl::init(true), cl::Hidden
, cl::cat(BoltCategory
));
31 DumpORC("dump-orc", cl::desc("dump raw ORC unwind information (sorted)"),
32 cl::init(false), cl::Hidden
, cl::cat(BoltCategory
));
36 /// Linux Kernel supports stack unwinding using ORC (oops rewind capability).
37 /// ORC state at every IP can be described by the following data structure.
43 bool operator==(const ORCState
&Other
) const {
44 return SPOffset
== Other
.SPOffset
&& BPOffset
== Other
.BPOffset
&&
/// Inequality comparison, defined as the logical negation of operator== applied
/// to this object and \p Other.
48 bool operator!=(const ORCState
&Other
) const { return !(*this == Other
); }
51 /// Basic printer for ORC entry. It does not provide the same level of
52 /// information as objtool (for now).
53 inline raw_ostream
&operator<<(raw_ostream
&OS
, const ORCState
&E
) {
55 OS
<< format("{sp: %d, bp: %d, info: 0x%x}", E
.SPOffset
, E
.BPOffset
,
62 /// Section terminator ORC entry. All three initializers are zero; ORC states
/// elsewhere in this file are compared against it with operator== and
/// operator!= to recognize terminator entries.
63 static ORCState NullORC
= {0, 0, 0};
65 class LinuxKernelRewriter final
: public MetadataRewriter
{
66 /// Linux Kernel special sections point to a specific instruction in many
67 /// cases. Unlike SDTMarkerInfo, these markers can come from different
69 struct LKInstructionMarkerInfo
{
70 uint64_t SectionOffset
;
71 int32_t PCRelativeOffset
;
73 StringRef SectionName
;
76 /// Map Linux kernel program locations/instructions to their pointers in
77 /// special Linux kernel sections.
78 std::unordered_map
<uint64_t, std::vector
<LKInstructionMarkerInfo
>> LKMarkers
;
80 /// Linux ORC sections.
81 ErrorOr
<BinarySection
&> ORCUnwindSection
= std::errc::bad_address
;
82 ErrorOr
<BinarySection
&> ORCUnwindIPSection
= std::errc::bad_address
;
84 /// Size of entries in ORC sections.
85 static constexpr size_t ORC_UNWIND_ENTRY_SIZE
= 6;
86 static constexpr size_t ORC_UNWIND_IP_ENTRY_SIZE
= 4;
89 uint64_t IP
; /// Instruction address.
90 BinaryFunction
*BF
; /// Binary function corresponding to the entry.
91 ORCState ORC
; /// Stack unwind info in ORC format.
93 bool operator<(const ORCListEntry
&Other
) const {
98 return ORC
== NullORC
;
102 using ORCListType
= std::vector
<ORCListEntry
>;
103 ORCListType ORCEntries
;
105 /// Insert an LKMarker for a given code pointer \p PC from a non-code section
107 void insertLKMarker(uint64_t PC
, uint64_t SectionOffset
,
108 int32_t PCRelativeOffset
, bool IsPCRelative
,
109 StringRef SectionName
);
111 /// Process linux kernel special sections and their relocations.
112 void processLKSections();
114 /// Process special linux kernel section, __ex_table.
115 void processLKExTable();
117 /// Process special linux kernel section, .pci_fixup.
118 void processLKPCIFixup();
120 /// Process __ksymtab and __ksymtab_gpl.
121 void processLKKSymtab(bool IsGPL
= false);
123 /// Process special linux kernel section, __bug_table.
124 void processLKBugTable();
126 /// Process special linux kernel section, .smp_locks.
127 void processLKSMPLocks();
129 /// Update LKMarkers' locations for the output binary.
130 void updateLKMarkers();
132 /// Read ORC unwind information and annotate instructions.
133 Error
readORCTables();
135 /// Update ORC for functions once CFG is constructed.
136 Error
processORCPostCFG();
138 /// Update ORC data in the binary.
139 Error
rewriteORCTables();
141 /// Mark instructions referenced by kernel metadata.
142 Error
markInstructions();
/// Construct the rewriter for \p BC. The fixed name "linux-kernel-rewriter"
/// and \p BC are forwarded to the MetadataRewriter base constructor; the body
/// itself does no additional work.
145 LinuxKernelRewriter(BinaryContext
&BC
)
146 : MetadataRewriter("linux-kernel-rewriter", BC
) {}
148 Error
preCFGInitializer() override
{
150 if (Error E
= markInstructions())
153 if (Error E
= readORCTables())
156 return Error::success();
159 Error
postCFGInitializer() override
{
160 if (Error E
= processORCPostCFG())
163 return Error::success();
166 Error
postEmitFinalizer() override
{
169 if (Error E
= rewriteORCTables())
172 return Error::success();
176 Error
LinuxKernelRewriter::markInstructions() {
177 for (const uint64_t PC
: llvm::make_first_range(LKMarkers
)) {
178 BinaryFunction
*BF
= BC
.getBinaryFunctionContainingAddress(PC
);
180 if (!BF
|| !BC
.shouldEmit(*BF
))
183 const uint64_t Offset
= PC
- BF
->getAddress();
184 MCInst
*Inst
= BF
->getInstructionAtOffset(Offset
);
186 return createStringError(errc::executable_format_error
,
187 "no instruction matches kernel marker offset");
189 BC
.MIB
->setOffset(*Inst
, static_cast<uint32_t>(Offset
));
191 BF
->setHasSDTMarker(true);
194 return Error::success();
197 void LinuxKernelRewriter::insertLKMarker(uint64_t PC
, uint64_t SectionOffset
,
198 int32_t PCRelativeOffset
,
200 StringRef SectionName
) {
201 LKMarkers
[PC
].emplace_back(LKInstructionMarkerInfo
{
202 SectionOffset
, PCRelativeOffset
, IsPCRelative
, SectionName
});
205 void LinuxKernelRewriter::processLKSections() {
206 assert(opts::LinuxKernelMode
&&
207 "process Linux Kernel special sections and their relocations only in "
208 "linux kernel mode.\n");
213 processLKKSymtab(true);
218 /// Process __ex_table section of Linux Kernel.
219 /// This section contains information regarding kernel level exception
220 /// handling (https://www.kernel.org/doc/html/latest/x86/exception-tables.html).
221 /// More documentation is in arch/x86/include/asm/extable.h.
223 /// The section is the list of the following structures:
225 /// struct exception_table_entry {
231 void LinuxKernelRewriter::processLKExTable() {
232 ErrorOr
<BinarySection
&> SectionOrError
=
233 BC
.getUniqueSectionByName("__ex_table");
237 const uint64_t SectionSize
= SectionOrError
->getSize();
238 const uint64_t SectionAddress
= SectionOrError
->getAddress();
239 assert((SectionSize
% 12) == 0 &&
240 "The size of the __ex_table section should be a multiple of 12");
241 for (uint64_t I
= 0; I
< SectionSize
; I
+= 4) {
242 const uint64_t EntryAddress
= SectionAddress
+ I
;
243 ErrorOr
<uint64_t> Offset
= BC
.getSignedValueAtAddress(EntryAddress
, 4);
244 assert(Offset
&& "failed reading PC-relative offset for __ex_table");
245 int32_t SignedOffset
= *Offset
;
246 const uint64_t RefAddress
= EntryAddress
+ SignedOffset
;
248 BinaryFunction
*ContainingBF
=
249 BC
.getBinaryFunctionContainingAddress(RefAddress
);
253 MCSymbol
*ReferencedSymbol
= ContainingBF
->getSymbol();
254 const uint64_t FunctionOffset
= RefAddress
- ContainingBF
->getAddress();
257 llvm_unreachable("bad alignment of __ex_table");
261 insertLKMarker(RefAddress
, I
, SignedOffset
, true, "__ex_table");
266 ReferencedSymbol
= ContainingBF
->addEntryPointAtOffset(FunctionOffset
);
267 BC
.addRelocation(EntryAddress
, ReferencedSymbol
, Relocation::getPC32(), 0,
272 assert(!FunctionOffset
&&
273 "__ex_table handler entry should point to function start");
274 BC
.addRelocation(EntryAddress
, ReferencedSymbol
, Relocation::getPC32(), 0,
281 /// Process .pci_fixup section of Linux Kernel.
282 /// This section contains a list of entries for different PCI devices and their
283 /// corresponding hook handlers (a code pointer to where the fixup code
284 /// resides; on x86_64 it is usually stored as a 32-bit PC-relative offset).
285 /// Documentation is in include/linux/pci.h.
286 void LinuxKernelRewriter::processLKPCIFixup() {
287 ErrorOr
<BinarySection
&> SectionOrError
=
288 BC
.getUniqueSectionByName(".pci_fixup");
289 assert(SectionOrError
&&
290 ".pci_fixup section not found in Linux Kernel binary");
291 const uint64_t SectionSize
= SectionOrError
->getSize();
292 const uint64_t SectionAddress
= SectionOrError
->getAddress();
293 assert((SectionSize
% 16) == 0 && ".pci_fixup size is not a multiple of 16");
295 for (uint64_t I
= 12; I
+ 4 <= SectionSize
; I
+= 16) {
296 const uint64_t PC
= SectionAddress
+ I
;
297 ErrorOr
<uint64_t> Offset
= BC
.getSignedValueAtAddress(PC
, 4);
298 assert(Offset
&& "cannot read value from .pci_fixup");
299 const int32_t SignedOffset
= *Offset
;
300 const uint64_t HookupAddress
= PC
+ SignedOffset
;
301 BinaryFunction
*HookupFunction
=
302 BC
.getBinaryFunctionAtAddress(HookupAddress
);
303 assert(HookupFunction
&& "expected function for entry in .pci_fixup");
304 BC
.addRelocation(PC
, HookupFunction
->getSymbol(), Relocation::getPC32(), 0,
309 /// Process __ksymtab[_gpl] sections of Linux Kernel.
310 /// This section lists all the vmlinux symbols that kernel modules can access.
312 /// All the entries are 4 bytes each and hence we can read them one by one
313 /// and ignore the ones that are not pointing to the .text section. All pointers
314 /// are PC-relative offsets and always point to the beginning of a function.
315 void LinuxKernelRewriter::processLKKSymtab(bool IsGPL
) {
316 StringRef SectionName
= "__ksymtab";
318 SectionName
= "__ksymtab_gpl";
319 ErrorOr
<BinarySection
&> SectionOrError
=
320 BC
.getUniqueSectionByName(SectionName
);
321 assert(SectionOrError
&&
322 "__ksymtab[_gpl] section not found in Linux Kernel binary");
323 const uint64_t SectionSize
= SectionOrError
->getSize();
324 const uint64_t SectionAddress
= SectionOrError
->getAddress();
325 assert((SectionSize
% 4) == 0 &&
326 "The size of the __ksymtab[_gpl] section should be a multiple of 4");
328 for (uint64_t I
= 0; I
< SectionSize
; I
+= 4) {
329 const uint64_t EntryAddress
= SectionAddress
+ I
;
330 ErrorOr
<uint64_t> Offset
= BC
.getSignedValueAtAddress(EntryAddress
, 4);
331 assert(Offset
&& "Reading valid PC-relative offset for a ksymtab entry");
332 const int32_t SignedOffset
= *Offset
;
333 const uint64_t RefAddress
= EntryAddress
+ SignedOffset
;
334 BinaryFunction
*BF
= BC
.getBinaryFunctionAtAddress(RefAddress
);
338 BC
.addRelocation(EntryAddress
, BF
->getSymbol(), Relocation::getPC32(), 0,
343 /// Process __bug_table section.
344 /// This section contains information useful for kernel debugging.
345 /// Each entry in the section is a struct bug_entry that contains a pointer to
346 /// the ud2 instruction corresponding to the bug, corresponding file name (both
347 /// pointers use PC relative offset addressing), line number, and flags.
348 /// The definition of the struct bug_entry can be found in
349 /// `include/asm-generic/bug.h`
350 void LinuxKernelRewriter::processLKBugTable() {
351 ErrorOr
<BinarySection
&> SectionOrError
=
352 BC
.getUniqueSectionByName("__bug_table");
356 const uint64_t SectionSize
= SectionOrError
->getSize();
357 const uint64_t SectionAddress
= SectionOrError
->getAddress();
358 assert((SectionSize
% 12) == 0 &&
359 "The size of the __bug_table section should be a multiple of 12");
360 for (uint64_t I
= 0; I
< SectionSize
; I
+= 12) {
361 const uint64_t EntryAddress
= SectionAddress
+ I
;
362 ErrorOr
<uint64_t> Offset
= BC
.getSignedValueAtAddress(EntryAddress
, 4);
364 "Reading valid PC-relative offset for a __bug_table entry");
365 const int32_t SignedOffset
= *Offset
;
366 const uint64_t RefAddress
= EntryAddress
+ SignedOffset
;
367 assert(BC
.getBinaryFunctionContainingAddress(RefAddress
) &&
368 "__bug_table entries should point to a function");
370 insertLKMarker(RefAddress
, I
, SignedOffset
, true, "__bug_table");
374 /// .smp_locks section contains PC-relative references to instructions with LOCK
375 /// prefix. The prefix can be converted to NOP at boot time on non-SMP systems.
376 void LinuxKernelRewriter::processLKSMPLocks() {
377 ErrorOr
<BinarySection
&> SectionOrError
=
378 BC
.getUniqueSectionByName(".smp_locks");
382 uint64_t SectionSize
= SectionOrError
->getSize();
383 const uint64_t SectionAddress
= SectionOrError
->getAddress();
384 assert((SectionSize
% 4) == 0 &&
385 "The size of the .smp_locks section should be a multiple of 4");
387 for (uint64_t I
= 0; I
< SectionSize
; I
+= 4) {
388 const uint64_t EntryAddress
= SectionAddress
+ I
;
389 ErrorOr
<uint64_t> Offset
= BC
.getSignedValueAtAddress(EntryAddress
, 4);
390 assert(Offset
&& "Reading valid PC-relative offset for a .smp_locks entry");
391 int32_t SignedOffset
= *Offset
;
392 uint64_t RefAddress
= EntryAddress
+ SignedOffset
;
394 BinaryFunction
*ContainingBF
=
395 BC
.getBinaryFunctionContainingAddress(RefAddress
);
399 insertLKMarker(RefAddress
, I
, SignedOffset
, true, ".smp_locks");
403 void LinuxKernelRewriter::updateLKMarkers() {
404 if (LKMarkers
.size() == 0)
407 std::unordered_map
<std::string
, uint64_t> PatchCounts
;
408 for (std::pair
<const uint64_t, std::vector
<LKInstructionMarkerInfo
>>
409 &LKMarkerInfoKV
: LKMarkers
) {
410 const uint64_t OriginalAddress
= LKMarkerInfoKV
.first
;
411 const BinaryFunction
*BF
=
412 BC
.getBinaryFunctionContainingAddress(OriginalAddress
, false, true);
416 uint64_t NewAddress
= BF
->translateInputToOutputAddress(OriginalAddress
);
420 // Apply base address.
421 if (OriginalAddress
>= 0xffffffff00000000 && NewAddress
< 0xffffffff)
422 NewAddress
= NewAddress
+ 0xffffffff00000000;
424 if (OriginalAddress
== NewAddress
)
427 for (LKInstructionMarkerInfo
&LKMarkerInfo
: LKMarkerInfoKV
.second
) {
428 StringRef SectionName
= LKMarkerInfo
.SectionName
;
429 SimpleBinaryPatcher
*LKPatcher
;
430 ErrorOr
<BinarySection
&> BSec
= BC
.getUniqueSectionByName(SectionName
);
431 assert(BSec
&& "missing section info for kernel section");
432 if (!BSec
->getPatcher())
433 BSec
->registerPatcher(std::make_unique
<SimpleBinaryPatcher
>());
434 LKPatcher
= static_cast<SimpleBinaryPatcher
*>(BSec
->getPatcher());
435 PatchCounts
[std::string(SectionName
)]++;
436 if (LKMarkerInfo
.IsPCRelative
)
437 LKPatcher
->addLE32Patch(LKMarkerInfo
.SectionOffset
,
438 NewAddress
- OriginalAddress
+
439 LKMarkerInfo
.PCRelativeOffset
);
441 LKPatcher
->addLE64Patch(LKMarkerInfo
.SectionOffset
, NewAddress
);
444 outs() << "BOLT-INFO: patching linux kernel sections. Total patches per "
445 "section are as follows:\n";
446 for (const std::pair
<const std::string
, uint64_t> &KV
: PatchCounts
)
447 outs() << " Section: " << KV
.first
<< ", patch-counts: " << KV
.second
451 Error
LinuxKernelRewriter::readORCTables() {
452 // NOTE: we should ignore relocations for orc tables as the tables are sorted
453 // post-link time and relocations are not updated.
454 ORCUnwindSection
= BC
.getUniqueSectionByName(".orc_unwind");
455 ORCUnwindIPSection
= BC
.getUniqueSectionByName(".orc_unwind_ip");
457 if (!ORCUnwindSection
&& !ORCUnwindIPSection
)
458 return Error::success();
460 if (!ORCUnwindSection
|| !ORCUnwindIPSection
)
461 return createStringError(errc::executable_format_error
,
462 "missing ORC section");
464 const uint64_t NumEntries
=
465 ORCUnwindIPSection
->getSize() / ORC_UNWIND_IP_ENTRY_SIZE
;
466 if (ORCUnwindSection
->getSize() != NumEntries
* ORC_UNWIND_ENTRY_SIZE
||
467 ORCUnwindIPSection
->getSize() != NumEntries
* ORC_UNWIND_IP_ENTRY_SIZE
)
468 return createStringError(errc::executable_format_error
,
469 "ORC entries number mismatch detected");
471 const uint64_t IPSectionAddress
= ORCUnwindIPSection
->getAddress();
472 DataExtractor OrcDE
= DataExtractor(ORCUnwindSection
->getContents(),
473 BC
.AsmInfo
->isLittleEndian(),
474 BC
.AsmInfo
->getCodePointerSize());
475 DataExtractor IPDE
= DataExtractor(ORCUnwindIPSection
->getContents(),
476 BC
.AsmInfo
->isLittleEndian(),
477 BC
.AsmInfo
->getCodePointerSize());
478 DataExtractor::Cursor
ORCCursor(0);
479 DataExtractor::Cursor
IPCursor(0);
481 for (uint32_t Index
= 0; Index
< NumEntries
; ++Index
) {
483 IPSectionAddress
+ IPCursor
.tell() + (int32_t)IPDE
.getU32(IPCursor
);
485 // Consume the status of the cursor.
487 return createStringError(errc::executable_format_error
,
488 "out of bounds while reading ORC IP table");
490 if (IP
< PrevIP
&& opts::Verbosity
)
491 errs() << "BOLT-WARNING: out of order IP 0x" << Twine::utohexstr(IP
)
492 << " detected while reading ORC\n";
496 // Store all entries, including those we are not going to update, as the
497 // tables need to be sorted globally before being written out.
498 ORCEntries
.push_back(ORCListEntry());
499 ORCListEntry
&Entry
= ORCEntries
.back();
502 Entry
.ORC
.SPOffset
= (int16_t)OrcDE
.getU16(ORCCursor
);
503 Entry
.ORC
.BPOffset
= (int16_t)OrcDE
.getU16(ORCCursor
);
504 Entry
.ORC
.Info
= (int16_t)OrcDE
.getU16(ORCCursor
);
506 // Consume the status of the cursor.
508 return createStringError(errc::executable_format_error
,
509 "out of bounds while reading ORC");
511 BinaryFunction
*&BF
= Entry
.BF
;
512 BF
= BC
.getBinaryFunctionContainingAddress(IP
, /*CheckPastEnd*/ true);
514 // If the entry immediately pointing past the end of the function is not
515 // the terminator entry, then it does not belong to this function.
516 if (BF
&& BF
->getAddress() + BF
->getSize() == IP
&& Entry
.ORC
!= NullORC
)
519 // If terminator entry points to the start of the function, then it belongs
520 // to a different function that contains the previous IP.
521 if (BF
&& BF
->getAddress() == IP
&& Entry
.ORC
== NullORC
)
522 BF
= BC
.getBinaryFunctionContainingAddress(IP
- 1);
526 errs() << "BOLT-WARNING: no binary function found matching ORC 0x"
527 << Twine::utohexstr(IP
) << ": " << Entry
.ORC
<< '\n';
531 if (Entry
.ORC
== NullORC
)
536 if (!BF
->hasInstructions())
539 MCInst
*Inst
= BF
->getInstructionAtOffset(IP
- BF
->getAddress());
541 return createStringError(
542 errc::executable_format_error
,
543 "no instruction at address 0x%" PRIx64
" in .orc_unwind_ip", IP
);
545 // Some addresses will have two entries associated with them. The first
546 // one being a "weak" section terminator. Since we ignore the terminator,
547 // we should only assign one entry per instruction.
548 if (BC
.MIB
->hasAnnotation(*Inst
, "ORC"))
549 return createStringError(
550 errc::executable_format_error
,
551 "duplicate non-terminal ORC IP 0x%" PRIx64
" in .orc_unwind_ip", IP
);
553 BC
.MIB
->addAnnotation(*Inst
, "ORC", Entry
.ORC
);
556 // Older kernels could contain unsorted tables in the file as the tables were
557 // sorted during boot time.
558 llvm::sort(ORCEntries
);
561 outs() << "BOLT-INFO: ORC unwind information:\n";
562 for (const ORCListEntry
&E
: ORCEntries
) {
563 outs() << "0x" << Twine::utohexstr(E
.IP
) << ": " << E
.ORC
;
565 outs() << ": " << *E
.BF
;
570 return Error::success();
573 Error
LinuxKernelRewriter::processORCPostCFG() {
574 // Propagate ORC to the rest of the function. We can annotate every
575 // instruction in every function, but to minimize the overhead, we annotate
576 // the first instruction in every basic block to reflect the state at the
577 // entry. This way, the ORC state can be calculated based on annotations
578 // regardless of the basic block layout. Note that if we insert/delete
579 // instructions, we must take care to attach ORC info to the new/deleted ones.
580 for (BinaryFunction
&BF
: llvm::make_second_range(BC
.getBinaryFunctions())) {
582 std::optional
<ORCState
> CurrentState
;
583 for (BinaryBasicBlock
&BB
: BF
) {
584 for (MCInst
&Inst
: BB
) {
585 ErrorOr
<ORCState
> State
=
586 BC
.MIB
->tryGetAnnotationAs
<ORCState
>(Inst
, "ORC");
589 CurrentState
= *State
;
593 // In case there was no ORC entry that matched the function start
594 // address, we need to propagate ORC state from the previous entry.
597 llvm::partition_point(ORCEntries
, [&](const ORCListEntry
&E
) {
598 return E
.IP
< BF
.getAddress();
600 if (It
!= ORCEntries
.begin())
603 if (It
->ORC
== NullORC
&& BF
.hasORC())
604 errs() << "BOLT-WARNING: ORC unwind info excludes prologue for "
607 CurrentState
= It
->ORC
;
608 if (It
->ORC
!= NullORC
)
612 // While printing ORC, attach info to every instruction for convenience.
613 if (opts::PrintORC
|| &Inst
== &BB
.front())
614 BC
.MIB
->addAnnotation(Inst
, "ORC", *CurrentState
);
619 return Error::success();
622 Error
LinuxKernelRewriter::rewriteORCTables() {
624 return Error::success();
628 std::unique_ptr
<MetadataRewriter
>
629 llvm::bolt::createLinuxKernelRewriter(BinaryContext
&BC
) {
630 return std::make_unique
<LinuxKernelRewriter
>(BC
);