//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
///
/// Provide a pass which mitigates speculative execution attacks which operate
/// by speculating incorrectly past some predicate (a type check, bounds check,
/// or other condition) to reach a load with invalid inputs and leak the data
/// accessed by that load using a side channel out of the speculative domain.
///
/// For details on the attacks, see the first variant in both the Project Zero
/// writeup and the Spectre paper:
/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
/// https://spectreattack.com/spectre.pdf
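///
/// As a rough illustration (a hypothetical source pattern, not a specific test
/// case), the code shape this pass defends against looks like:
///
///   if (idx < len) {        // branch predicted taken even when idx >= len
///     char v = arr[idx];    // speculative out-of-bounds load
///     use(table[v * 64]);   // dependent access leaks `v` via the cache
///   }
///
/// The pass tracks a per-function "predicate state" value that becomes
/// all-ones whenever execution follows a mispredicted edge and merges that
/// state into vulnerable addresses (or loaded values), so misspeculated loads
/// see a useless all-ones address or value rather than secret data.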
///
//===----------------------------------------------------------------------===//

#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define PASS_KEY "x86-slh"
#define DEBUG_TYPE PASS_KEY

STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
STATISTIC(NumAddrRegsHardened,
          "Number of address mode used registers hardened");
STATISTIC(NumPostLoadRegsHardened,
          "Number of post-load register values hardened");
STATISTIC(NumCallsOrJumpsHardened,
          "Number of calls or jumps requiring extra hardening");
STATISTIC(NumInstsInserted, "Number of instructions inserted");
STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");

static cl::opt<bool> EnableSpeculativeLoadHardening(
    "x86-speculative-load-hardening",
    cl::desc("Force enable speculative load hardening"), cl::init(false),
    cl::Hidden);

static cl::opt<bool> HardenEdgesWithLFENCE(
    PASS_KEY "-lfence",
    cl::desc(
        "Use LFENCE along each conditional edge to harden against speculative "
        "loads rather than conditional movs and poisoned pointers."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> EnablePostLoadHardening(
    PASS_KEY "-post-load",
    cl::desc("Harden the value loaded *after* it is loaded by "
             "flushing the loaded bits to 1. This is hard to do "
             "in general but can be done easily for GPRs."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> FenceCallAndRet(
    PASS_KEY "-fence-call-and-ret",
    cl::desc("Use a full speculation fence to harden both call and ret edges "
             "rather than a lighter weight mitigation."),
    cl::init(false), cl::Hidden);

static cl::opt<bool> HardenInterprocedurally(
    PASS_KEY "-ip",
    cl::desc("Harden interprocedurally by passing our state in and out of "
             "functions in the high bits of the stack pointer."),
    cl::init(true), cl::Hidden);

static cl::opt<bool>
    HardenLoads(PASS_KEY "-loads",
                cl::desc("Sanitize loads from memory. When disabled, no "
                         "significant security is provided."),
                cl::init(true), cl::Hidden);

static cl::opt<bool> HardenIndirectCallsAndJumps(
    PASS_KEY "-indirect",
    cl::desc("Harden indirect calls and jumps against using speculatively "
             "stored attacker controlled addresses. This is designed to "
             "mitigate Spectre v1.2 style attacks."),
    cl::init(true), cl::Hidden);
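
// Note: in normal use the mitigation is requested per-function via the
// `speculative_load_hardening` IR attribute (for example via clang's
// -mspeculative-load-hardening); the flags above are primarily debugging knobs
// to force the pass on or to select alternative hardening strategies.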

namespace {

class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
public:
  X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { }

  StringRef getPassName() const override {
    return "X86 speculative load hardening";
  }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  /// The information about a block's conditional terminators needed to trace
  /// our predicate state through the exiting edges.
  struct BlockCondInfo {
    MachineBasicBlock *MBB;

    // We mostly have one conditional branch, and in extremely rare cases have
    // two. Three and more are so rare as to be unimportant for compile time.
    SmallVector<MachineInstr *, 2> CondBrs;

    MachineInstr *UncondBr;
  };

  /// Manages the predicate state traced through the program.
  struct PredState {
    unsigned InitialReg = 0;
    unsigned PoisonReg = 0;

    const TargetRegisterClass *RC;
    MachineSSAUpdater SSA;

    PredState(MachineFunction &MF, const TargetRegisterClass *RC)
        : RC(RC), SSA(MF) {}
  };

  const X86Subtarget *Subtarget = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  const X86InstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

  std::optional<PredState> PS;

  void hardenEdgesWithLFENCE(MachineFunction &MF);

  SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);

  SmallVector<MachineInstr *, 16>
  tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);

  void unfoldCallAndJumpLoads(MachineFunction &MF);

  SmallVector<MachineInstr *, 16>
  tracePredStateThroughIndirectBranches(MachineFunction &MF);

  void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);

  unsigned saveEFLAGS(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator InsertPt,
                      const DebugLoc &Loc);
  void restoreEFLAGS(MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator InsertPt, const DebugLoc &Loc,
                     Register Reg);

  void mergePredStateIntoSP(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator InsertPt,
                            const DebugLoc &Loc, unsigned PredStateReg);
  unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
                                  const DebugLoc &Loc);

  void
  hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
                 MachineOperand &IndexMO,
                 SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
  MachineInstr *
  sinkPostLoadHardenedInst(MachineInstr &MI,
                           SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
  bool canHardenRegister(Register Reg);
  unsigned hardenValueInRegister(Register Reg, MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &Loc);
  unsigned hardenPostLoad(MachineInstr &MI);
  void hardenReturnInstr(MachineInstr &MI);
  void tracePredStateThroughCall(MachineInstr &MI);
  void hardenIndirectCallOrJumpInstr(
      MachineInstr &MI,
      SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
};

} // end anonymous namespace

char X86SpeculativeLoadHardeningPass::ID = 0;

void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
    AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
}

static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
                                    MachineBasicBlock &Succ, int SuccCount,
                                    MachineInstr *Br, MachineInstr *&UncondBr,
                                    const X86InstrInfo &TII) {
  assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");

  MachineFunction &MF = *MBB.getParent();

  MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();

  // We have to insert the new block immediately after the current one as we
  // don't know what layout-successor relationships the successor has and we
  // may not be able to (and generally don't want to) try to fix those up.
  MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);

  // Update the branch instruction if necessary.
  if (Br) {
    assert(Br->getOperand(0).getMBB() == &Succ &&
           "Didn't start with the right target!");
    Br->getOperand(0).setMBB(&NewMBB);

    // If this successor was reached through a branch rather than fallthrough,
    // we might have *broken* fallthrough and so need to inject a new
    // unconditional branch.
    if (!UncondBr) {
      MachineBasicBlock &OldLayoutSucc =
          *std::next(MachineFunction::iterator(&NewMBB));
      assert(MBB.isSuccessor(&OldLayoutSucc) &&
             "Without an unconditional branch, the old layout successor should "
             "be an actual successor!");
      auto BrBuilder =
          BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
      // Update the unconditional branch now that we've added one.
      UncondBr = &*BrBuilder;
    }

    // Insert unconditional "jump Succ" instruction in the new block if
    // necessary.
    if (!NewMBB.isLayoutSuccessor(&Succ)) {
      SmallVector<MachineOperand, 4> Cond;
      TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
    }
  } else {
    assert(!UncondBr &&
           "Cannot have a branchless successor and an unconditional branch!");
    assert(NewMBB.isLayoutSuccessor(&Succ) &&
           "A non-branch successor must have been a layout successor before "
           "and now is a layout successor of the new block.");
  }

  // If this is the only edge to the successor, we can just replace it in the
  // CFG. Otherwise we need to add a new entry in the CFG for the new
  // successor.
  if (SuccCount == 1) {
    MBB.replaceSuccessor(&Succ, &NewMBB);
  } else {
    MBB.splitSuccessor(&Succ, &NewMBB);
  }

  // Hook up the edge from the new basic block to the old successor in the CFG.
  NewMBB.addSuccessor(&Succ);

  // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
  for (MachineInstr &MI : Succ) {
    if (!MI.isPHI())
      break;
    for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
         OpIdx += 2) {
      MachineOperand &OpV = MI.getOperand(OpIdx);
      MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
      assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
      if (OpMBB.getMBB() != &MBB)
        continue;

      // If this is the last edge to the successor, just replace MBB in the
      // PHI node.
      if (SuccCount == 1) {
        OpMBB.setMBB(&NewMBB);
        break;
      }

      // Otherwise, append a new pair of operands for the new incoming edge.
      MI.addOperand(MF, OpV);
      MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
      break;
    }
  }

  // Inherit live-ins from the successor.
  for (auto &LI : Succ.liveins())
    NewMBB.addLiveIn(LI);

  LLVM_DEBUG(dbgs() << "  Split edge from '" << MBB.getName() << "' to '"
                    << Succ.getName() << "'.\n");
  return NewMBB;
}

/// Remove duplicate PHI operands to leave the PHI in a canonical and
/// predictable form.
///
/// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
/// isn't what you might expect. We may have multiple entries in PHI nodes for
/// a single predecessor. This makes CFG-updating extremely complex, so here we
/// simplify all PHI nodes to a model even simpler than the IR's model: exactly
/// one entry per predecessor, regardless of how many edges there are.
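///
/// For example (an illustrative MIR-style sketch, not a literal test case), a
/// PHI carrying a duplicated entry for one predecessor such as
///
///   %v:gr64 = PHI %a, %bb.1, %a, %bb.1, %b, %bb.2
///
/// is rewritten to carry exactly one entry per predecessor:
///
///   %v:gr64 = PHI %a, %bb.1, %b, %bb.2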
static void canonicalizePHIOperands(MachineFunction &MF) {
  SmallPtrSet<MachineBasicBlock *, 4> Preds;
  SmallVector<int, 4> DupIndices;
  for (auto &MBB : MF)
    for (auto &MI : MBB) {
      if (!MI.isPHI())
        break;

      // First we scan the operands of the PHI looking for duplicate entries
      // of a particular predecessor. We retain the operand index of each
      // duplicate entry found.
      for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
           OpIdx += 2)
        if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
          DupIndices.push_back(OpIdx);

      // Now walk the duplicate indices, removing both the block and value.
      // Note that these are stored as a vector making this element-wise
      // removal potentially quadratic.
      //
      // FIXME: It is really frustrating that we have to use a quadratic
      // removal algorithm here. There should be a better way, but the use-def
      // updates required make that impossible using the public API.
      //
      // Note that we have to process these backwards so that we don't
      // invalidate other indices with each removal.
      while (!DupIndices.empty()) {
        int OpIdx = DupIndices.pop_back_val();
        // Remove both the block and value operand, again in reverse order to
        // preserve indices.
        MI.removeOperand(OpIdx + 1);
        MI.removeOperand(OpIdx);
      }

      Preds.clear();
    }
}

/// Helper to scan a function for loads vulnerable to misspeculation that we
/// want to harden.
///
/// We use this to avoid making changes to functions where there is nothing we
/// need to do to harden against misspeculation.
static bool hasVulnerableLoad(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // Loads within this basic block after an LFENCE are not at risk of
      // speculatively executing with invalid predicates from prior control
      // flow. So break out of this block but continue scanning the function.
      if (MI.getOpcode() == X86::LFENCE)
        break;

      // Looking for loads only.
      if (!MI.mayLoad())
        continue;

      // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
      if (MI.getOpcode() == X86::MFENCE)
        continue;

      // We found a load.
      return true;
    }
  }

  // No loads found.
  return false;
}

bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
    MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << " **********\n");

  // Only run if this pass is force-enabled or we detect the relevant function
  // attribute requesting SLH.
  if (!EnableSpeculativeLoadHardening &&
      !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
    return false;

  Subtarget = &MF.getSubtarget<X86Subtarget>();
  MRI = &MF.getRegInfo();
  TII = Subtarget->getInstrInfo();
  TRI = Subtarget->getRegisterInfo();

  // FIXME: Support for 32-bit.
  PS.emplace(MF, &X86::GR64_NOSPRegClass);

  if (MF.begin() == MF.end())
    // Nothing to do for a degenerate empty function...
    return false;

  // We support an alternative hardening technique based on a debug flag.
  if (HardenEdgesWithLFENCE) {
    hardenEdgesWithLFENCE(MF);
    return true;
  }

  // Create a dummy debug loc to use for all the generated code here.
  DebugLoc Loc;

  MachineBasicBlock &Entry = *MF.begin();
  auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());

  // Do a quick scan to see if we have any checkable loads.
  bool HasVulnerableLoad = hasVulnerableLoad(MF);

  // See if we have any conditional branching blocks that we will need to trace
  // predicate state through.
  SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);

  // If we have no interesting conditions or loads, nothing to do here.
  if (!HasVulnerableLoad && Infos.empty())
    return true;

  // The poison value is required to be an all-ones value for many aspects of
  // this mitigation.
  const int PoisonVal = -1;
  PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
  BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
      .addImm(PoisonVal);
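  // (In broad strokes: under misspeculation the predicate state is cmov'ed to
  // this all-ones poison, and the hardening code merges the state into
  // addresses or loaded values, yielding an all-ones address or value instead
  // of secret data.)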

  // If we have loads being hardened and we've asked for call and ret edges to
  // get a full fence-based mitigation, inject that fence.
  if (HasVulnerableLoad && FenceCallAndRet) {
    // We need to insert an LFENCE at the start of the function to suspend any
    // incoming misspeculation from the caller. This helps two-fold: the caller
    // may not have been protected as this code has been, and this code gets to
    // not take any specific action to protect across calls.
    // FIXME: We could skip this for functions which unconditionally return
    // a constant.
    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
    ++NumLFENCEsInserted;
  }

  // If we guarded the entry with an LFENCE and have no conditionals to protect
  // in blocks, then we're done.
  if (FenceCallAndRet && Infos.empty())
    // We may have changed the function's code at this point to insert fences.
    return true;

  if (HardenInterprocedurally && !FenceCallAndRet) {
    // Set up the predicate state by extracting it from the incoming stack
    // pointer so we pick up any misspeculation in our caller.
    PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
  } else {
    // Otherwise, just build the predicate state itself by zeroing a register
    // as we don't need any initial state.
    PS->InitialReg = MRI->createVirtualRegister(PS->RC);
    Register PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
    auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
                         PredStateSubReg);
    MachineOperand *ZeroEFLAGSDefOp =
        ZeroI->findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr);
    assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
           "Must have an implicit def of EFLAGS!");
    ZeroEFLAGSDefOp->setIsDead(true);
    BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
            PS->InitialReg)
        .addImm(0)
        .addReg(PredStateSubReg)
        .addImm(X86::sub_32bit);
  }

  // We're going to need to trace predicate state throughout the function's
  // CFG. Prepare for this by setting up our initial state of PHIs with unique
  // predecessor entries and all the initial predicate state.
  canonicalizePHIOperands(MF);

  // Track the updated values in an SSA updater to rewrite into SSA form at the
  // end.
  PS->SSA.Initialize(PS->InitialReg);
  PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);

  // Trace through the CFG.
  auto CMovs = tracePredStateThroughCFG(MF, Infos);

  // We may also enter basic blocks in this function via exception handling
  // control flow. Here, if we are hardening interprocedurally, we need to
  // re-capture the predicate state from the throwing code. In the Itanium ABI,
  // the throw will always look like a call to __cxa_throw and will have the
  // predicate state in the stack pointer, so extract fresh predicate state from
  // the stack pointer and make it available in SSA.
  // FIXME: Handle non-itanium ABI EH models.
  if (HardenInterprocedurally) {
    for (MachineBasicBlock &MBB : MF) {
      assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
      assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
      assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
      if (!MBB.isEHPad())
        continue;
      PS->SSA.AddAvailableValue(
          &MBB,
          extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
    }
  }

  if (HardenIndirectCallsAndJumps) {
    // If we are going to harden calls and jumps we need to unfold their memory
    // operands.
    unfoldCallAndJumpLoads(MF);

    // Then we trace predicate state through the indirect branches.
    auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
    CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
  }

  // Now that we have the predicate state available at the start of each block
  // in the CFG, trace it through each block, hardening vulnerable instructions
  // as we go.
  tracePredStateThroughBlocksAndHarden(MF);

  // Now rewrite all the uses of the pred state using the SSA updater to insert
  // PHIs connecting the state between blocks along the CFG edges.
  for (MachineInstr *CMovI : CMovs)
    for (MachineOperand &Op : CMovI->operands()) {
      if (!Op.isReg() || Op.getReg() != PS->InitialReg)
        continue;

      PS->SSA.RewriteUse(Op);
    }

  LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
             dbgs() << "\n"; MF.verify(this));
  return true;
}

/// Implements the naive hardening approach of putting an LFENCE after every
/// potentially mis-predicted control flow construct.
///
/// We include this as an alternative mostly for the purpose of comparison. The
/// performance impact of this is expected to be extremely severe and not
/// practical for any real-world users.
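///
/// As an illustrative sketch (assumed assembly, not literal pass output), a
/// conditional branch such as
///
///   testq %rdi, %rdi
///   je    .LBB0_2
///
/// ends up with an `lfence` at the top of each non-EH-pad successor block, so
/// speculation cannot continue past the mispredictable edge in either
/// direction.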
void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
    MachineFunction &MF) {
  // First, we scan the function looking for blocks that are reached along
  // edges that we might want to harden.
  SmallSetVector<MachineBasicBlock *, 8> Blocks;
  for (MachineBasicBlock &MBB : MF) {
    // If there are no or only one successor, nothing to do here.
    if (MBB.succ_size() <= 1)
      continue;

    // Skip blocks unless their terminators start with a branch. Other
    // terminators don't seem interesting for guarding against misspeculation.
    auto TermIt = MBB.getFirstTerminator();
    if (TermIt == MBB.end() || !TermIt->isBranch())
      continue;

    // Add all the non-EH-pad successors to the blocks we want to harden. We
    // skip EH pads because there isn't really a condition of interest on
    // entering.
    for (MachineBasicBlock *SuccMBB : MBB.successors())
      if (!SuccMBB->isEHPad())
        Blocks.insert(SuccMBB);
  }

  for (MachineBasicBlock *MBB : Blocks) {
    auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
    BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
    ++NumLFENCEsInserted;
  }
}

SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
  SmallVector<BlockCondInfo, 16> Infos;

  // Walk the function and build up a summary for each block's conditions that
  // we need to trace through.
  for (MachineBasicBlock &MBB : MF) {
    // If there are no or only one successor, nothing to do here.
    if (MBB.succ_size() <= 1)
      continue;

    // We want to reliably handle any conditional branch terminators in the
    // MBB, so we manually analyze the branch. We can handle all of the
    // permutations here, including ones that analyzeBranch cannot.
    //
    // The approach is to walk backwards across the terminators, resetting at
    // any unconditional non-indirect branch, and track all conditional edges
    // to basic blocks as well as the fallthrough or unconditional successor
    // edge. For each conditional edge, we track the target and the opposite
    // condition code in order to inject a "no-op" cmov into that successor
    // that will harden the predicate. For the fallthrough/unconditional
    // edge, we inject a separate cmov for each conditional branch with
    // matching condition codes. This effectively implements an "and" of the
    // condition flags, even if there isn't a single condition flag that would
    // directly implement that. We don't bother trying to optimize either of
    // these cases because if such an optimization is possible, LLVM should
    // have optimized the conditional *branches* in that way already to reduce
    // instruction count. This late, we simply assume the minimal number of
    // branch instructions is being emitted and use that to guide our cmov
    // insertion.

    BlockCondInfo Info = {&MBB, {}, nullptr};

    // Now walk backwards through the terminators and build up successors they
    // reach and the conditions.
    for (MachineInstr &MI : llvm::reverse(MBB)) {
      // Once we've handled all the terminators, we're done.
      if (!MI.isTerminator())
        break;

      // If we see a non-branch terminator, we can't handle anything so bail.
      if (!MI.isBranch()) {
        Info.CondBrs.clear();
        break;
      }

      // If we see an unconditional branch, reset our state, clear any
      // fallthrough, and set this as the "else" successor.
      if (MI.getOpcode() == X86::JMP_1) {
        Info.CondBrs.clear();
        Info.UncondBr = &MI;
        continue;
      }

      // If we get an invalid condition, we have an indirect branch or some
      // other unanalyzable "fallthrough" case. We model this as a nullptr for
      // the destination so we can still guard any conditional successors.
      // Consider code sequences like:
      //   jCC L1
      //   jmpq *%rax
      // We still want to harden the edge to `L1`.
      if (X86::getCondFromBranch(MI) == X86::COND_INVALID) {
        Info.CondBrs.clear();
        Info.UncondBr = nullptr;
        continue;
      }

      // We have a vanilla conditional branch, add it to our list.
      Info.CondBrs.push_back(&MI);
    }
    if (Info.CondBrs.empty()) {
      ++NumBranchesUntraced;
      LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
                 MBB.dump());
      continue;
    }

    Infos.push_back(Info);
  }

  return Infos;
}

/// Trace the predicate state through the CFG, instrumenting each conditional
/// branch such that misspeculation through an edge will poison the predicate
/// state.
///
/// Returns the list of inserted CMov instructions so that they can have their
/// uses of the predicate state rewritten into proper SSA form once it is
/// complete.
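///
/// As a sketch of the resulting code (illustrative only, with symbolic
/// register names), for a branch such as `jne .Ltarget` the block reached
/// along the taken edge gets a cmov keyed on the *opposite* condition:
///
///   cmoveq %poison, %pred_state  # EFLAGS say we should not have branched,
///                                # so this path is misspeculation: poison.
///
/// and the fallthrough block gets one cmov per conditional branch using the
/// matching condition code, effectively AND-ing the branch conditions
/// together.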
SmallVector<MachineInstr *, 16>
X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
    MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
  // Collect the inserted cmov instructions so we can rewrite their uses of the
  // predicate state into SSA form.
  SmallVector<MachineInstr *, 16> CMovs;

  // Now walk all of the basic blocks looking for ones that end in conditional
  // jumps where we need to update this register along each edge.
  for (const BlockCondInfo &Info : Infos) {
    MachineBasicBlock &MBB = *Info.MBB;
    const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
    MachineInstr *UncondBr = Info.UncondBr;

    LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
                      << "\n");
    ++NumCondBranchesTraced;

    // Compute the non-conditional successor as either the target of any
    // unconditional branch or the layout successor.
    MachineBasicBlock *UncondSucc =
        UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
                        ? UncondBr->getOperand(0).getMBB()
                        : nullptr)
                 : &*std::next(MachineFunction::iterator(&MBB));

    // Count how many edges there are to any given successor.
    SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
    if (UncondSucc)
      ++SuccCounts[UncondSucc];
    for (auto *CondBr : CondBrs)
      ++SuccCounts[CondBr->getOperand(0).getMBB()];

    // A lambda to insert cmov instructions into a block checking all of the
    // condition codes in a sequence.
    auto BuildCheckingBlockForSuccAndConds =
        [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
            MachineInstr *Br, MachineInstr *&UncondBr,
            ArrayRef<X86::CondCode> Conds) {
          // First, we split the edge to insert the checking block into a safe
          // location.
          auto &CheckingMBB =
              (SuccCount == 1 && Succ.pred_size() == 1)
                  ? Succ
                  : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);

          bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
          if (!LiveEFLAGS)
            CheckingMBB.addLiveIn(X86::EFLAGS);

          // Now insert the cmovs to implement the checks.
          auto InsertPt = CheckingMBB.begin();
          assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
                 "Should never have a PHI in the initial checking block as it "
                 "always has a single predecessor!");

          // We will wire each cmov to each other, but need to start with the
          // incoming pred state.
          unsigned CurStateReg = PS->InitialReg;

          for (X86::CondCode Cond : Conds) {
            int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
            auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);

            Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
            // Note that we intentionally use an empty debug location so that
            // this picks up the preceding location.
            auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
                                 TII->get(CMovOp), UpdatedStateReg)
                             .addReg(CurStateReg)
                             .addReg(PS->PoisonReg)
                             .addImm(Cond);
            // If this is the last cmov and the EFLAGS weren't originally
            // live-in, mark them as killed.
            if (!LiveEFLAGS && Cond == Conds.back())
              CMovI->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)
                  ->setIsKill(true);

            LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump();
                       dbgs() << "\n");

            // The first one of the cmovs will be using the top level
            // `PredStateReg` and need to get rewritten into SSA form.
            if (CurStateReg == PS->InitialReg)
              CMovs.push_back(&*CMovI);

            // The next cmov should start from this one's def.
            CurStateReg = UpdatedStateReg;
          }

          // And put the last one into the available values for SSA form of our
          // predicate state.
          PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
        };

    std::vector<X86::CondCode> UncondCodeSeq;
    for (auto *CondBr : CondBrs) {
      MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
      int &SuccCount = SuccCounts[&Succ];

      X86::CondCode Cond = X86::getCondFromBranch(*CondBr);
      X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
      UncondCodeSeq.push_back(Cond);

      BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
                                        {InvCond});

      // Decrement the successor count now that we've split one of the edges.
      // We need to keep the count of edges to the successor accurate in order
      // to know above when to *replace* the successor in the CFG vs. just
      // adding the new successor.
      --SuccCount;
    }

    // Since we may have split edges and changed the number of successors,
    // normalize the probabilities. This avoids doing it each time we split an
    // edge.
    MBB.normalizeSuccProbs();

    // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
    // need to intersect the other condition codes. We can do this by just
    // doing a cmov for each one.
    //
    // If we have no fallthrough to protect (perhaps it is an indirect jump?)
    // just skip this and continue.
    if (!UncondSucc)
      continue;

    assert(SuccCounts[UncondSucc] == 1 &&
           "We should never have more than one edge to the unconditional "
           "successor at this point because every other edge must have been "
           "split above!");

    // Sort and unique the codes to minimize them.
    llvm::sort(UncondCodeSeq);
    UncondCodeSeq.erase(llvm::unique(UncondCodeSeq), UncondCodeSeq.end());

    // Build a checking version of the successor.
    BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
                                      UncondBr, UncondBr, UncondCodeSeq);
  }

  return CMovs;
}

/// Compute the register class for the unfolded load.
///
/// FIXME: This should probably live in X86InstrInfo, potentially by adding
/// a way to unfold into a newly created vreg rather than requiring a register
/// input.
static const TargetRegisterClass *
getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
                           unsigned Opcode) {
  unsigned Index;
  unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
      Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
  const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
}
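
// As an illustrative example of the unfolding performed below (registers
// chosen arbitrarily), a memory-operand call such as
//
//   callq *8(%rdi)
//
// becomes an explicit load feeding a register-indirect call:
//
//   movq 8(%rdi), %rax
//   callq *%rax
//
// which exposes the load so it can be hardened like any other load and leaves
// an indirect call whose target register can also be hardened.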
void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
    MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF)
    // We use make_early_inc_range here so we can remove instructions if needed
    // without disturbing the iteration.
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB.instrs())) {
      // Must either be a call or a branch.
      if (!MI.isCall() && !MI.isBranch())
        continue;
      // We only care about loading variants of these instructions.
      if (!MI.mayLoad())
        continue;

      switch (MI.getOpcode()) {
      default: {
        LLVM_DEBUG(
            dbgs() << "ERROR: Found an unexpected loading branch or call "
                      "instruction:\n";
            MI.dump(); dbgs() << "\n");
        report_fatal_error("Unexpected loading branch or call!");
      }

      case X86::FARCALL16m:
      case X86::FARCALL32m:
      case X86::FARCALL64m:
      case X86::FARJMP16m:
      case X86::FARJMP32m:
      case X86::FARJMP64m:
        // We cannot mitigate far jumps or calls, but we also don't expect them
        // to be vulnerable to Spectre v1.2 style attacks.
        continue;

      case X86::CALL16m:
      case X86::CALL16m_NT:
      case X86::CALL32m:
      case X86::CALL32m_NT:
      case X86::CALL64m:
      case X86::CALL64m_NT:
      case X86::JMP16m:
      case X86::JMP16m_NT:
      case X86::JMP32m:
      case X86::JMP32m_NT:
      case X86::JMP64m:
      case X86::JMP64m_NT:
      case X86::TAILJMPm64:
      case X86::TAILJMPm64_REX:
      case X86::TAILJMPm:
      case X86::TCRETURNmi64:
      case X86::TCRETURNmi: {
        // Use the generic unfold logic now that we know we're dealing with
        // expected instructions.
        // FIXME: We don't have test coverage for all of these!
        auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
        if (!UnfoldedRC) {
          LLVM_DEBUG(dbgs()
                         << "ERROR: Unable to unfold load from instruction:\n";
                     MI.dump(); dbgs() << "\n");
          report_fatal_error("Unable to unfold load!");
        }
        Register Reg = MRI->createVirtualRegister(UnfoldedRC);
        SmallVector<MachineInstr *, 2> NewMIs;
        // If we were able to compute an unfolded reg class, any failure here
        // is just a programming error so just assert.
        bool Unfolded =
            TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
                                     /*UnfoldStore*/ false, NewMIs);
        (void)Unfolded;
        assert(Unfolded &&
               "Computed unfolded register class but failed to unfold");
        // Now stitch the new instructions into place and erase the old one.
        for (auto *NewMI : NewMIs)
          MBB.insert(MI.getIterator(), NewMI);

        // Update the call site info.
        if (MI.isCandidateForCallSiteEntry())
          MF.eraseCallSiteInfo(&MI);

        MI.eraseFromParent();
        LLVM_DEBUG({
          dbgs() << "Unfolded load successfully into:\n";
          for (auto *NewMI : NewMIs) {
            NewMI->dump();
            dbgs() << "\n";
          }
        });
        continue;
      }
      }
      llvm_unreachable("Escaped switch with default!");
    }
}

/// Trace the predicate state through indirect branches, instrumenting them to
/// poison the state if a target is reached that does not match the expected
/// target.
///
/// This is designed to mitigate Spectre variant 1 attacks where an indirect
/// branch is trained to predict a particular target and then mispredicts that
/// target in a way that can leak data. Despite using an indirect branch, this
/// is really a variant 1 style attack: it does not steer execution to an
/// arbitrary or attacker controlled address, and it does not require any
/// special code executing next to the victim. This attack can also be
/// mitigated through retpolines, but those require either replacing indirect
/// branches with conditional direct branches or lowering them through a device
/// that blocks speculation. This mitigation can replace these retpoline-style
/// mitigations for jump tables and other indirect branches within a function
/// when variant 2 isn't a risk while allowing limited speculation. Indirect
/// calls, however, cannot be mitigated through this technique without changing
/// the ABI in a fundamental way.
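///
/// Sketch of the inserted check (illustrative, assuming the small code model
/// so the block address fits in an immediate): each block reachable via an
/// indirect branch begins with roughly
///
///   cmpq $.Ltarget_block, %target_addr
///   cmovneq %poison, %pred_state
///
/// so that arriving at the block when some other target was intended poisons
/// the predicate state.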
SmallVector<MachineInstr *, 16>
X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
    MachineFunction &MF) {
  // We use the SSAUpdater to insert PHI nodes for the target addresses of
  // indirect branches. We don't actually need the full power of the SSA updater
  // in this particular case as we always have immediately available values, but
  // this avoids us having to re-implement the PHI construction logic.
  MachineSSAUpdater TargetAddrSSA(MF);
  TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));

  // Track which blocks were terminated with an indirect branch.
  SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;

  // We need to know what blocks end up reached via indirect branches. We
  // expect this to be a subset of those whose address is taken and so track it
  // directly via the CFG.
  SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;

  // Walk all the blocks which end in an indirect branch and make the
  // target address available.
  for (MachineBasicBlock &MBB : MF) {
    // Find the last terminator.
    auto MII = MBB.instr_rbegin();
    while (MII != MBB.instr_rend() && MII->isDebugInstr())
      ++MII;
    if (MII == MBB.instr_rend())
      continue;
    MachineInstr &TI = *MII;
    if (!TI.isTerminator() || !TI.isBranch())
      // No terminator or non-branch terminator.
      continue;

    unsigned TargetReg;

    switch (TI.getOpcode()) {
    default:
      // Direct branch or conditional branch (leading to fallthrough).
      continue;

    case X86::FARJMP16m:
    case X86::FARJMP32m:
    case X86::FARJMP64m:
      // We cannot mitigate far jumps or calls, but we also don't expect them
      // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
      continue;

    case X86::JMP16m:
    case X86::JMP16m_NT:
    case X86::JMP32m:
    case X86::JMP32m_NT:
    case X86::JMP64m:
    case X86::JMP64m_NT:
      // Mostly as documentation.
      report_fatal_error("Memory operand jumps should have been unfolded!");

    case X86::JMP16r:
      report_fatal_error(
          "Support for 16-bit indirect branches is not implemented.");
    case X86::JMP32r:
      report_fatal_error(
          "Support for 32-bit indirect branches is not implemented.");

    case X86::JMP64r:
      TargetReg = TI.getOperand(0).getReg();
    }

    // We have definitely found an indirect branch. Verify that there are no
    // preceding conditional branches as we don't yet support that.
    if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
          return !OtherTI.isDebugInstr() && &OtherTI != &TI;
        })) {
      LLVM_DEBUG({
        dbgs() << "ERROR: Found other terminators in a block with an indirect "
                  "branch! This is not yet supported! Terminator sequence:\n";
        for (MachineInstr &MI : MBB.terminators()) {
          MI.dump();
          dbgs() << '\n';
        }
      });
      report_fatal_error("Unimplemented terminator sequence!");
    }

    // Make the target register an available value for this block.
    TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
    IndirectTerminatedMBBs.insert(&MBB);

    // Add all the successors to our target candidates.
    for (MachineBasicBlock *Succ : MBB.successors())
      IndirectTargetMBBs.insert(Succ);
  }

  // Keep track of the cmov instructions we insert so we can return them.
  SmallVector<MachineInstr *, 16> CMovs;

  // If we didn't find any indirect branches with targets, nothing to do here.
  if (IndirectTargetMBBs.empty())
    return CMovs;

  // We found indirect branches and targets that need to be instrumented to
  // harden loads within them. Walk the blocks of the function (to get a stable
  // ordering) and instrument each target of an indirect branch.
  for (MachineBasicBlock &MBB : MF) {
    // Skip the blocks that aren't candidate targets.
    if (!IndirectTargetMBBs.count(&MBB))
      continue;

    // We don't expect EH pads to ever be reached via an indirect branch. If
    // this is desired for some reason, we could simply skip them here rather
    // than asserting.
    assert(!MBB.isEHPad() &&
           "Unexpected EH pad as target of an indirect branch!");

    // We should never end up threading EFLAGS into a block to harden
    // conditional jumps as there would be an additional successor via the
    // indirect branch. As a consequence, all such edges would be split before
    // reaching here, and the inserted block will handle the EFLAGS-based
    // hardening.
    assert(!MBB.isLiveIn(X86::EFLAGS) &&
           "Cannot check within a block that already has live-in EFLAGS!");

    // We can't handle having non-indirect edges into this block unless this is
    // the only successor and we can synthesize the necessary target address.
    for (MachineBasicBlock *Pred : MBB.predecessors()) {
      // If we've already handled this by extracting the target directly,
      // there is nothing more to do.
      if (IndirectTerminatedMBBs.count(Pred))
        continue;

      // Otherwise, we have to be the only successor. We generally expect this
      // to be true as conditional branches should have had a critical edge
      // split already. We don't however need to worry about EH pad successors
      // as they'll happily ignore the target and their hardening strategy is
      // resilient to all ways in which they could be reached speculatively.
      if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
            return Succ->isEHPad() || Succ == &MBB;
          })) {
        LLVM_DEBUG({
          dbgs() << "ERROR: Found conditional entry to target of indirect "
                    "branch!\n";
          Pred->dump();
          MBB.dump();
        });
        report_fatal_error("Cannot harden a conditional entry to a target of "
                           "an indirect branch!");
      }

      // Now we need to compute the address of this block and install it as a
      // synthetic target in the predecessor. We do this at the bottom of the
      // predecessor.
      auto InsertPt = Pred->getFirstTerminator();
      Register TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
      if (MF.getTarget().getCodeModel() == CodeModel::Small &&
          !Subtarget->isPositionIndependent()) {
        // Directly materialize it into an immediate.
        auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
                             TII->get(X86::MOV64ri32), TargetReg)
                         .addMBB(&MBB);
        LLVM_DEBUG(dbgs() << "  Inserting mov: "; AddrI->dump();
                   dbgs() << "\n");
      } else {
        auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
                             TargetReg)
                         .addReg(/*Base*/ X86::RIP)
                         .addImm(/*Scale*/ 1)
                         .addReg(/*Index*/ 0)
                         .addMBB(&MBB)
                         .addReg(/*Segment*/ 0);
        LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump();
                   dbgs() << "\n");
      }
      // And make this available.
      TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
    }

    // Materialize the needed SSA value of the target. Note that we need the
    // middle of the block as this block might at the bottom have an indirect
    // branch back to itself. We can do this here because at this point, every
    // predecessor of this block has an available value. This is basically just
    // automating the construction of a PHI node for this target.
    Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);

    // Insert a comparison of the incoming target register with this block's
    // address. This also requires us to mark the block as having its address
    // taken explicitly.
    MBB.setMachineBlockAddressTaken();
    auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
        !Subtarget->isPositionIndependent()) {
      // Check directly against a relocated immediate when we can.
      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
                        .addReg(TargetReg, RegState::Kill)
                        .addMBB(&MBB);
      LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    } else {
      // Otherwise compute the address into a register first.
      Register AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
      auto AddrI =
          BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
              .addReg(/*Base*/ X86::RIP)
              .addImm(/*Scale*/ 1)
              .addReg(/*Index*/ 0)
              .addMBB(&MBB)
              .addReg(/*Segment*/ 0);
      LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump(); dbgs() << "\n");
      auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
                        .addReg(TargetReg, RegState::Kill)
                        .addReg(AddrReg, RegState::Kill);
      LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
    }

    // Now cmov over the predicate if the comparison wasn't equal.
    int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
    auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);
    Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
    auto CMovI =
        BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
            .addReg(PS->InitialReg)
            .addReg(PS->PoisonReg)
            .addImm(X86::COND_NE);
    CMovI->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)
        ->setIsKill(true);
    LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
    CMovs.push_back(&*CMovI);

    // And put the new value into the available values for SSA form of our
    // predicate state.
    PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
  }

  // Return all the newly inserted cmov instructions of the predicate state.
  return CMovs;
}

// Returns true if the MI has EFLAGS as a register def operand and it's live,
// otherwise it returns false.
static bool isEFLAGSDefLive(const MachineInstr &MI) {
  if (const MachineOperand *DefOp =
          MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
    return !DefOp->isDead();
  }
  return false;
}

static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const TargetRegisterInfo &TRI) {
  // Check if EFLAGS are alive by seeing if there is a def of them or they are
  // live-in, and then seeing if that def is in turn used.
  for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
    if (MachineOperand *DefOp =
            MI.findRegisterDefOperand(X86::EFLAGS, /*TRI=*/nullptr)) {
      // If the def is dead, then EFLAGS is not live.
      if (DefOp->isDead())
        return false;

      // Otherwise we've def'ed it, and it is live.
      return true;
    }
    // While at this instruction, also check if we use and kill EFLAGS
    // which means it isn't live.
    if (MI.killsRegister(X86::EFLAGS, &TRI))
      return false;
  }

  // If we didn't find anything conclusive (neither definitely alive nor
  // definitely dead) return whether it lives into the block.
  return MBB.isLiveIn(X86::EFLAGS);
}

/// Trace the predicate state through each of the blocks in the function,
/// hardening everything necessary along the way.
///
/// We call this routine once the initial predicate state has been established
/// for each basic block in the function in the SSA updater. This routine traces
/// it through the instructions within each basic block, and for non-returning
/// blocks informs the SSA updater about the final state that lives out of the
/// block. Along the way, it hardens any vulnerable instruction using the
/// currently valid predicate state. We have to do these two things together
/// because the SSA updater only works across blocks. Within a block, we track
/// the current predicate state directly and update it as it changes.
///
/// This operates in two passes over each block. First, we analyze the loads in
/// the block to determine which strategy will be used to harden them: hardening
/// the address or hardening the loaded value when loaded into a register
/// amenable to hardening. We have to process these first because the two
/// strategies may interact -- later hardening may change what strategy we wish
/// to use. We also will analyze data dependencies between loads and avoid
/// hardening those loads that are data dependent on a load with a hardened
/// address. We also skip hardening loads already behind an LFENCE as that is
/// sufficient to harden them against misspeculation.
///
/// Second, we actively trace the predicate state through the block, applying
/// the hardening steps we determined necessary in the first pass as we go.
///
/// These two passes are applied to each basic block. We operate one block at a
/// time to simplify reasoning about reachability and sequencing.
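///
/// As a rough sketch of the two strategies (illustrative assembly, not literal
/// output): hardening an address merges the predicate state into the base or
/// index register before the load executes,
///
///   orq %pred_state, %rcx   # an all-ones state forces the address to -1,
///   movq (%rcx), %rax       # so the load cannot read attacker-chosen memory
///
/// while post-load hardening instead merges the state into the loaded value,
/// so a misspeculated load yields all-ones rather than a secret.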
void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
    MachineFunction &MF) {
  SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
  SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;

  SmallSet<unsigned, 16> HardenedAddrRegs;

  SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;

  // Track the set of load-dependent registers through the basic block. Because
  // the values of these registers have an existing data dependency on a loaded
  // value which we would have checked, we can omit any checks on them.
  SparseBitVector<> LoadDepRegs;

  for (MachineBasicBlock &MBB : MF) {
    // The first pass over the block: collect all the loads which can have
    // their loaded value hardened and all the loads that instead need their
    // address hardened. During this walk we propagate load dependence for
    // address hardened loads and also look for LFENCE to stop hardening
    // wherever possible. When deciding whether or not to harden the loaded
    // value or not, we check to see if any registers used in the address will
    // have been hardened at this point and if so, harden any remaining address
    // registers as that often successfully re-uses hardened addresses and
    // minimizes instructions.
    //
    // FIXME: We should consider an aggressive mode where we continue to keep
    // as many loads value hardened even when some address register hardening
    // would be free (due to reuse).
    //
    // Note that we only need this pass if we are actually hardening loads.
    if (HardenLoads)
      for (MachineInstr &MI : MBB) {
        // We naively assume that all def'ed registers of an instruction have
        // a data dependency on all of their operands.
        // FIXME: Do a more careful analysis of x86 to build a conservative
        // model here.
        if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
              return Op.isReg() && LoadDepRegs.test(Op.getReg());
            })) {
          for (MachineOperand &Def : MI.defs())
            if (Def.isReg())
              LoadDepRegs.set(Def.getReg());
          continue;
        }

        // Both Intel and AMD are guiding that they will change the semantics of
        // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
        // no more need to guard things in this block.
        if (MI.getOpcode() == X86::LFENCE)
          break;

        // If this instruction cannot load, nothing to do.
        if (!MI.mayLoad())
          continue;

        // Some instructions which "load" are trivially safe or unimportant.
        if (MI.getOpcode() == X86::MFENCE)
          continue;

        // Extract the memory operand information about this instruction.
        const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
        if (MemRefBeginIdx < 0) {
          LLVM_DEBUG(dbgs()
                         << "WARNING: unable to harden loading instruction: ";
                     MI.dump());
          continue;
        }

        MachineOperand &BaseMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);

        // If we have at least one (non-frame-index, non-RIP) register operand,
        // and neither operand is load-dependent, we need to check the load.
        unsigned BaseReg = 0, IndexReg = 0;
        if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
            BaseMO.getReg() != X86::NoRegister)
          BaseReg = BaseMO.getReg();
        if (IndexMO.getReg() != X86::NoRegister)
          IndexReg = IndexMO.getReg();

        if (!BaseReg && !IndexReg)
          // No register operands!
          continue;

        // If any register operand is dependent, this load is dependent and we
        // needn't check it.
        // FIXME: Is this true in the case where we are hardening loads after
        // they complete? Unclear, need to investigate.
        if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
            (IndexReg && LoadDepRegs.test(IndexReg)))
          continue;

        // If post-load hardening is enabled, this load is compatible with
        // post-load hardening, and we aren't already going to harden one of the
        // address registers, queue it up to be hardened post-load. Notably,
        // even once hardened this won't introduce a useful dependency that
        // could prune out subsequent loads.
        if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
            !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
            MI.getOperand(0).isReg() &&
            canHardenRegister(MI.getOperand(0).getReg()) &&
            !HardenedAddrRegs.count(BaseReg) &&
            !HardenedAddrRegs.count(IndexReg)) {
          HardenPostLoad.insert(&MI);
          HardenedAddrRegs.insert(MI.getOperand(0).getReg());
          continue;
        }

        // Record this instruction for address hardening and record its
        // register operands as being address-hardened.
        HardenLoadAddr.insert(&MI);
        if (BaseReg)
          HardenedAddrRegs.insert(BaseReg);
        if (IndexReg)
          HardenedAddrRegs.insert(IndexReg);

        for (MachineOperand &Def : MI.defs())
          if (Def.isReg())
            LoadDepRegs.set(Def.getReg());
      }

    // Now re-walk the instructions in the basic block, and apply whichever
    // hardening strategy we have elected. Note that we do this in a second
    // pass specifically so that we have the complete set of instructions for
    // which we will do post-load hardening and can defer it in certain
    // circumstances.
    for (MachineInstr &MI : MBB) {
      if (HardenLoads) {
        // We cannot both require hardening the def of a load and its address.
        assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
               "Requested to harden both the address and def of a load!");

        // Check if this is a load whose address needs to be hardened.
        if (HardenLoadAddr.erase(&MI)) {
          const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(MI);
          assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");

          MachineOperand &BaseMO =
              MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
          MachineOperand &IndexMO =
              MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
          hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
          continue;
        }

        // Test if this instruction is one of our post load instructions (and
        // remove it from the set if so).
        if (HardenPostLoad.erase(&MI)) {
          assert(!MI.isCall() && "Must not try to post-load harden a call!");

          // If this is a data-invariant load and there is no EFLAGS
          // interference, we want to try and sink any hardening as far as
          // possible.
          if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
            // Sink the instruction we'll need to harden as far as we can down
            // the graph.
            MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);

            // If we managed to sink this instruction, update everything so we
            // harden that instruction when we reach it in the instruction
            // sequence.
            if (SunkMI != &MI) {
              // If in sinking there was no instruction needing to be hardened,
              // we're done.
              if (!SunkMI)
                continue;

              // Otherwise, add this to the set of defs we harden.
              HardenPostLoad.insert(SunkMI);
              continue;
            }
          }

          unsigned HardenedReg = hardenPostLoad(MI);

          // Mark the resulting hardened register as such so we don't re-harden.
          AddrRegToHardenedReg[HardenedReg] = HardenedReg;

          continue;
        }

        // Check for an indirect call or branch that may need its input hardened
        // even if we couldn't find the specific load used, or were able to
        // avoid hardening it for some reason. Note that here we cannot break
        // out afterward as we may still need to handle any call aspect of this
        // instruction.
        if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
          hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
      }

      // After we finish hardening loads we handle interprocedural hardening if
      // enabled and relevant for this instruction.
      if (!HardenInterprocedurally)
        continue;
      if (!MI.isCall() && !MI.isReturn())
        continue;

      // If this is a direct return (i.e., not a tail call) just directly harden
      // it.
      if (MI.isReturn() && !MI.isCall()) {
        hardenReturnInstr(MI);
        continue;
      }

      // Otherwise we have a call. We need to handle transferring the predicate
      // state into a call and recovering it after the call returns (unless this
      // is a tail call).
      assert(MI.isCall() && "Should only reach here for calls!");
      tracePredStateThroughCall(MI);
    }

    HardenPostLoad.clear();
    HardenLoadAddr.clear();
    HardenedAddrRegs.clear();
    AddrRegToHardenedReg.clear();

    // Currently, we only track data-dependent loads within a basic block.
    // FIXME: We should see if this is necessary or if we could be more
    // aggressive here without opening up attack avenues.
    LoadDepRegs.clear();
  }
}

/// Save EFLAGS into the returned GPR. This can in turn be restored with
/// `restoreEFLAGS`.
///
/// Note that LLVM can only lower very simple patterns of saved and restored
/// EFLAGS registers. The restore should always be within the same basic block
/// as the save so that no PHI nodes are inserted.
unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
  // what instruction selection does.
  Register Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
  // We directly copy the FLAGS register and rely on later lowering to clean
  // this up into the appropriate setCC instructions.
  BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
  return Reg;
}

/// Restore EFLAGS from the provided GPR. This should be produced by
/// `saveEFLAGS`.
///
/// This must be done within the same basic block as the save in order to
/// reliably lower.
void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc, Register Reg) {
  BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
}

/// Takes the current predicate state (in a register) and merges it into the
/// stack pointer. The state is essentially a single bit, but we merge this in
/// a way that won't form non-canonical pointers and also will be preserved
/// across normal stack adjustments.
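///
/// A minimal sketch of the merge (illustrative, 64-bit only):
///
///   shlq $47, %state        # all-ones state -> bits 47..63 set, which keeps
///   orq  %state, %rsp       # the pointer canonical but sets its high bit
///
/// A zero state leaves RSP untouched; a poisoned state sets the high bit,
/// which extractPredStateFromSP later smears across a full register with an
/// arithmetic right shift.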
1521 void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1522 MachineBasicBlock
&MBB
, MachineBasicBlock::iterator InsertPt
,
1523 const DebugLoc
&Loc
, unsigned PredStateReg
) {
1524 Register TmpReg
= MRI
->createVirtualRegister(PS
->RC
);
1525 // FIXME: This hard codes a shift distance based on the number of bits needed
1526 // to stay canonical on 64-bit. We should compute this somehow and support
1527 // 32-bit as part of that.
1528 auto ShiftI
= BuildMI(MBB
, InsertPt
, Loc
, TII
->get(X86::SHL64ri
), TmpReg
)
1529 .addReg(PredStateReg
, RegState::Kill
)
1531 ShiftI
->addRegisterDead(X86::EFLAGS
, TRI
);
1533 auto OrI
= BuildMI(MBB
, InsertPt
, Loc
, TII
->get(X86::OR64rr
), X86::RSP
)
1535 .addReg(TmpReg
, RegState::Kill
);
1536 OrI
->addRegisterDead(X86::EFLAGS
, TRI
);
1540 /// Extracts the predicate state stored in the high bits of the stack pointer.
unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  Register PredStateReg = MRI->createVirtualRegister(PS->RC);
  Register TmpReg = MRI->createVirtualRegister(PS->RC);

  // We know that the stack pointer will have any preserved predicate state in
  // its high bit. We just want to smear this across the other bits. Turns out,
  // this is exactly what an arithmetic right shift does.
  BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
      .addReg(X86::RSP);
  auto ShiftI =
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
          .addReg(TmpReg, RegState::Kill)
          .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
  ShiftI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;

  return PredStateReg;
}
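
// Hardening a load address folds the predicate state into the base and/or
// index registers before the load executes. As a rough sketch (illustrative
// only), a load such as `movq (%rsi,%rcx,8), %rax` becomes:
//
//   orq  %state, %rsi
//   orq  %state, %rcx
//   movq (%rsi,%rcx,8), %rax
//
// so that under misspeculation (state is all-ones) the address no longer
// depends on attacker-controlled, out-of-bounds values. The actual lowering
// below uses fresh virtual registers and a flag-preserving variant when
// EFLAGS are live.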
void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
    MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  // Check if EFLAGS are alive by seeing if there is a def of them or they
  // live-in, and then seeing if that def is in turn used.
  bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);

  SmallVector<MachineOperand *, 2> HardenOpRegs;

  if (BaseMO.isFI()) {
    // A frame index is never a dynamically controllable load, so only
    // harden it if we're covering fixed address loads as well.
    LLVM_DEBUG(
        dbgs() << "  Skipping hardening base of explicit stack frame load: ";
        MI.dump(); dbgs() << "\n");
  } else if (BaseMO.getReg() == X86::RSP) {
    // Some idempotent atomic operations are lowered directly to a locked
    // OR with 0 to the top of stack (or slightly offset from top) which uses an
    // explicit RSP register as the base.
    assert(IndexMO.getReg() == X86::NoRegister &&
           "Explicit RSP access with dynamic index!");
    LLVM_DEBUG(
        dbgs() << "  Cannot harden base of explicit RSP offset in a load!");
  } else if (BaseMO.getReg() == X86::RIP ||
             BaseMO.getReg() == X86::NoRegister) {
    // For both RIP-relative addressed loads or absolute loads, we cannot
    // meaningfully harden them because the address being loaded has no
    // dynamic component.
    //
    // FIXME: When using a segment base (like TLS does) we end up with the
    // dynamic address being the base plus -1 because we can't mutate the
    // segment register here. This allows the signed 32-bit offset to point at
    // valid segment-relative addresses and load them successfully.
    LLVM_DEBUG(
        dbgs() << "  Cannot harden base of "
               << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
               << " address in a load!");
  } else {
    assert(BaseMO.isReg() &&
           "Only allowed to have a frame index or register base.");
    HardenOpRegs.push_back(&BaseMO);
  }

  if (IndexMO.getReg() != X86::NoRegister &&
      (HardenOpRegs.empty() ||
       HardenOpRegs.front()->getReg() != IndexMO.getReg()))
    HardenOpRegs.push_back(&IndexMO);

  assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
         "Should have exactly one or two registers to harden!");
  assert((HardenOpRegs.size() == 1 ||
          HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
         "Should not have two of the same registers!");

  // Remove any registers that have already been checked.
  llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
    // See if this operand's register has already been checked.
    auto It = AddrRegToHardenedReg.find(Op->getReg());
    if (It == AddrRegToHardenedReg.end())
      // Not checked, so retain this one.
      return false;

    // Otherwise, we can directly update this operand and remove it.
    Op->setReg(It->second);
    return true;
  });
  // If there are none left, we're done.
  if (HardenOpRegs.empty())
    return;
  // Compute the current predicate state.
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);

  auto InsertPt = MI.getIterator();

  // If EFLAGS are live and we don't have access to instructions that avoid
  // clobbering EFLAGS we need to save and restore them. This in turn makes
  // the EFLAGS no longer live.
  unsigned FlagsReg = 0;
  if (EFLAGSLive && !Subtarget->hasBMI2()) {
    EFLAGSLive = false;
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
  }
  for (MachineOperand *Op : HardenOpRegs) {
    Register OpReg = Op->getReg();
    auto *OpRC = MRI->getRegClass(OpReg);
    Register TmpReg = MRI->createVirtualRegister(OpRC);

    // If this is a vector register, we'll need somewhat custom logic to handle
    // hardening it.
    if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
                                 OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
      assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);

      // Move our state into a vector register.
      // FIXME: We could skip this at the cost of longer encodings with AVX-512
      // but that doesn't seem likely to be worth it.
      Register VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
      auto MovI =
          BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
              .addReg(StateReg);
      (void)MovI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << "  Inserting mov: "; MovI->dump(); dbgs() << "\n");

      // Broadcast it across the vector register.
      Register VBStateReg = MRI->createVirtualRegister(OpRC);
      auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
                                TII->get(Is128Bit ? X86::VPBROADCASTQrr
                                                  : X86::VPBROADCASTQYrr),
                                VBStateReg)
                            .addReg(VStateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << "  Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      auto OrI =
          BuildMI(MBB, InsertPt, Loc,
                  TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
              .addReg(VBStateReg)
              .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
               OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
      assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
      bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
      bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
      if (Is128Bit || Is256Bit)
        assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");

      // Broadcast our state into a vector register.
      Register VStateReg = MRI->createVirtualRegister(OpRC);
      unsigned BroadcastOp = Is128Bit   ? X86::VPBROADCASTQrZ128rr
                             : Is256Bit ? X86::VPBROADCASTQrZ256rr
                                        : X86::VPBROADCASTQrZrr;
      auto BroadcastI =
          BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
              .addReg(StateReg);
      (void)BroadcastI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << "  Inserting broadcast: "; BroadcastI->dump();
                 dbgs() << "\n");

      // Merge our potential poison state into the value with a vector or.
      unsigned OrOp = Is128Bit   ? X86::VPORQZ128rr
                      : Is256Bit ? X86::VPORQZ256rr
                                 : X86::VPORQZrr;
      auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
                     .addReg(VStateReg)
                     .addReg(OpReg);
      (void)OrI;
      ++NumInstsInserted;
      LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
    } else {
      // FIXME: Need to support GR32 here for 32-bit code.
      assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
             "Not a supported register class for address hardening!");

      if (!EFLAGSLive) {
        // Merge our potential poison state into the value with an or.
        auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
                       .addReg(StateReg)
                       .addReg(OpReg);
        OrI->addRegisterDead(X86::EFLAGS, TRI);
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
      } else {
        // We need to avoid touching EFLAGS so shift out all but the least
        // significant bit using the instruction that doesn't update flags.
        auto ShiftI =
            BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
                .addReg(OpReg)
                .addReg(StateReg);
        (void)ShiftI;
        ++NumInstsInserted;
        LLVM_DEBUG(dbgs() << "  Inserting shrx: "; ShiftI->dump();
                   dbgs() << "\n");
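        // With a zero state the shift count is zero and the address value
        // passes through unchanged; with an all-ones state the masked count
        // is 63, so only the original top bit survives and the
        // attacker-controlled address bits are destroyed. Unlike SHR, SHRX
        // leaves EFLAGS untouched.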
      }
    }

    // Record this register as checked and update the operand.
    assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
           "Should not have checked this register yet!");
    AddrRegToHardenedReg[Op->getReg()] = TmpReg;
    Op->setReg(TmpReg);
    ++NumAddrRegsHardened;
  }

  // And restore the flags if needed.
  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
}
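
// Rather than hardening every loaded value directly, the pass may sink the
// hardening from a load to a single data-invariant use in the same block, so
// the `or` with the predicate state is emitted after that use instead of
// after the load itself. (As a rough illustration: if a load's only use is a
// data-invariant `add` whose result we can harden, the add's result gets
// hardened rather than the loaded value.)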
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
    MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
  assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
         "Cannot get here with a non-invariant load!");
  assert(!isEFLAGSDefLive(InitialMI) &&
         "Cannot get here with a data invariant load "
         "that interferes with EFLAGS!");

  // See if we can sink hardening the loaded value.
  auto SinkCheckToSingleUse =
      [&](MachineInstr &MI) -> std::optional<MachineInstr *> {
    Register DefReg = MI.getOperand(0).getReg();

    // We need to find a single use to which we can sink the check. We can
    // primarily do this because many uses may already end up checked on their
    // own.
    MachineInstr *SingleUseMI = nullptr;
    for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
      // If we're already going to harden this use, it is data invariant, it
      // does not interfere with EFLAGS, and within our block.
      if (HardenedInstrs.count(&UseMI)) {
        if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) {
          // If we've already decided to harden a non-load, we must have sunk
          // some other post-load hardened instruction to it and it must itself
          // be data-invariant.
          assert(X86InstrInfo::isDataInvariant(UseMI) &&
                 "Data variant instruction being hardened!");
          continue;
        }

        // Otherwise, this is a load and the load component can't be data
        // invariant so check how this register is being used.
        const int MemRefBeginIdx = X86::getFirstAddrOperandIdx(UseMI);
        assert(MemRefBeginIdx >= 0 &&
               "Should always have mem references here!");

        MachineOperand &BaseMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
        MachineOperand &IndexMO =
            UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
        if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
            (IndexMO.isReg() && IndexMO.getReg() == DefReg))
          // The load uses the register as part of its address making it not
          // invariant.
          return {};

        continue;
      }

      if (SingleUseMI)
        // We already have a single use, this would make two. Bail.
        return {};

      // If this single use isn't data invariant, isn't in this block, or has
      // interfering EFLAGS, we can't sink the hardening to it.
      if (!X86InstrInfo::isDataInvariant(UseMI) ||
          UseMI.getParent() != MI.getParent() || isEFLAGSDefLive(UseMI))
        return {};

      // If this instruction defines multiple registers bail as we won't harden
      // all of them.
      if (UseMI.getDesc().getNumDefs() > 1)
        return {};

      // If this register isn't a virtual register we can't walk uses of sanely,
      // just bail. Also check that its register class is one of the ones we
      // can harden.
      Register UseDefReg = UseMI.getOperand(0).getReg();
      if (!canHardenRegister(UseDefReg))
        return {};

      SingleUseMI = &UseMI;
    }

    // If SingleUseMI is still null, there is no use that needs its own
    // checking. Otherwise, it is the single use that needs checking.
    return {SingleUseMI};
  };

  MachineInstr *MI = &InitialMI;
  while (std::optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
    // Update which MI we're checking now.
    MI = *SingleUse;
    if (!MI)
      break;
  }

  return MI;
}
bool X86SpeculativeLoadHardeningPass::canHardenRegister(Register Reg) {
  // We only support hardening virtual registers.
  if (!Reg.isVirtual())
    return false;

  auto *RC = MRI->getRegClass(Reg);
  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
  if (RegBytes > 8)
    // We don't support post-load hardening of vectors.
    return false;

  unsigned RegIdx = Log2_32(RegBytes);
  assert(RegIdx < 4 && "Unsupported register size");

  // If this register class is explicitly constrained to a class that doesn't
  // require REX prefix, we may not be able to satisfy that constraint when
  // emitting the hardening instructions, so bail out here.
  // FIXME: This seems like a pretty lame hack. The way this comes up is when we
  // end up both with a NOREX and REX-only register as operands to the hardening
  // instructions. It would be better to fix that code to handle this situation
  // rather than hack around it in this way.
  const TargetRegisterClass *NOREXRegClasses[] = {
      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
  if (RC == NOREXRegClasses[RegIdx])
    return false;

  const TargetRegisterClass *GPRRegClasses[] = {
      &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
      &X86::GR64RegClass};
  return RC->hasSuperClassEq(GPRRegClasses[RegIdx]);
}
/// Harden a value in a register.
///
/// This is the low-level logic to fully harden a value sitting in a register
/// against leaking during speculative execution.
///
/// Unlike hardening an address that is used by a load, this routine is required
/// to hide *all* incoming bits in the register.
///
/// `Reg` must be a virtual register. Currently, it is required to be a GPR no
/// larger than the predicate state register. FIXME: We should support vector
/// registers here by broadcasting the predicate state.
///
/// The new, hardened virtual register is returned. It will have the same
/// register class as `Reg`.
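///
/// As a rough sketch (illustrative only), hardening a 64-bit value amounts to
///
///   orq %state, %value   # %value reads as all-ones under misspeculation
///
/// with the result placed in a fresh virtual register; narrower GPRs use a
/// same-width subregister copy of the state and the matching width of `or`.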
unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
    Register Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
    const DebugLoc &Loc) {
  assert(canHardenRegister(Reg) && "Cannot harden this register!");

  auto *RC = MRI->getRegClass(Reg);
  int Bytes = TRI->getRegSizeInBits(*RC) / 8;
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
         "Unknown register size");

  // FIXME: Need to teach this about 32-bit mode.
  if (Bytes != 8) {
    unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
    unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
    Register NarrowStateReg = MRI->createVirtualRegister(RC);
    BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
        .addReg(StateReg, 0, SubRegImm);
    StateReg = NarrowStateReg;
  }

  unsigned FlagsReg = 0;
  if (isEFLAGSLive(MBB, InsertPt, *TRI))
    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);

  Register NewReg = MRI->createVirtualRegister(RC);
  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
  unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
                 .addReg(StateReg)
                 .addReg(Reg);
  OrI->addRegisterDead(X86::EFLAGS, TRI);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");

  if (FlagsReg)
    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);

  return NewReg;
}
/// Harden a load by hardening the loaded value in the defined register.
///
/// We can harden a non-leaking load into a register without touching the
/// address by just hiding all of the loaded bits during misspeculation. We use
/// an `or` instruction to do this because we set up our poison value as all
/// ones; the goal is simply that the loaded bits are never exposed to
/// speculative execution, and coercing them all to one is sufficient for that.
///
/// Returns the newly hardened register.
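///
/// As a rough sketch (illustrative only), a hardened `movq (%rdi), %rax`
/// becomes:
///
///   movq (%rdi), %rax
///   orq  %state, %rax   # loaded bits read as all-ones under misspeculation
///
/// with the `or` result living in a new virtual register that replaces the
/// original uses of the load.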
unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();

  auto &DefOp = MI.getOperand(0);
  Register OldDefReg = DefOp.getReg();
  auto *DefRC = MRI->getRegClass(OldDefReg);

  // Because we want to completely replace the uses of this def'ed value with
  // the hardened value, create a dedicated new register that will only be used
  // to communicate the unhardened value to the hardening.
  Register UnhardenedReg = MRI->createVirtualRegister(DefRC);
  DefOp.setReg(UnhardenedReg);

  // Now harden this register's value, getting a hardened reg that is safe to
  // use. Note that we insert the instructions to compute this *after* the
  // defining instruction, not before it.
  unsigned HardenedReg = hardenValueInRegister(
      UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);

  // Finally, replace the old register (which now only has the uses of the
  // original def) with the hardened register.
  MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);

  ++NumPostLoadRegsHardened;
  return HardenedReg;
}
/// Harden a return instruction.
///
/// Returns implicitly perform a load which we need to harden. Without hardening
/// this load, an attacker may speculatively write over the return address to
/// steer speculation of the return to an attacker controlled address. This is
/// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
/// this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// We can harden this by introducing an LFENCE that will delay any load of the
/// return address until prior instructions have retired (and thus are not being
/// speculated), or we can harden the address used by the implicit load: the
/// stack pointer.
///
/// If we are not using an LFENCE, hardening the stack pointer has an additional
/// benefit: it allows us to pass the predicate state accumulated in this
/// function back to the caller. In the absence of a BCBS attack on the return,
/// the caller will typically be resumed and speculatively executed due to the
/// Return Stack Buffer (RSB) prediction which is very accurate and has a high
/// priority. It is possible that some code from the caller will be executed
/// speculatively even during a BCBS-attacked return until the steering takes
/// effect. Whenever this happens, the caller can recover the (poisoned)
/// predicate state from the stack pointer and continue to harden loads.
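///
/// As a rough sketch (illustrative only), a hardened return looks like:
///
///   shlq $47, %state
///   orq  %state, %rsp
///   retq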
void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &Loc = MI.getDebugLoc();
  auto InsertPt = MI.getIterator();

  if (FenceCallAndRet)
    // No need to fence here as we'll fence at the return site itself. That
    // handles more cases than we can handle here.
    return;

  // Take our predicate state, shift it to the high 17 bits (so that we keep
  // pointers canonical) and merge it into RSP. This will allow the caller to
  // extract it when we return (speculatively).
  mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
}
/// Trace the predicate state through a call.
///
/// There are several layers of this needed to handle the full complexity of
/// calls.
///
/// First, we need to send the predicate state into the called function. We do
/// this by merging it into the high bits of the stack pointer.
///
/// For tail calls, this is all we need to do.
///
/// For calls where we might return and resume the control flow, we need to
/// extract the predicate state from the high bits of the stack pointer after
/// control returns from the called function.
///
/// We also need to verify that we intended to return to this location in the
/// code. An attacker might arrange for the processor to mispredict the return
/// to this valid but incorrect return address in the program rather than the
/// correct one. See the paper on this attack, called "ret2spec" by the
/// researchers, here:
/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
///
/// The way we verify that we returned to the correct location is by preserving
/// the expected return address across the call. One technique involves taking
/// advantage of the red-zone to load the return address from `-8(%rsp)` where
/// it was left by the RET instruction when it popped `%rsp`. Alternatively, we
/// can directly save the address into a register that will be preserved across
/// the call. We compare this intended return address against the address
/// immediately following the call (the observed return address). If these
/// mismatch, we have detected misspeculation and can poison our predicate
/// state.
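///
/// As a rough sketch (illustrative only; small code model without PIC, and
/// with `%expected` standing for the virtual register holding the precomputed
/// or reloaded return address), the code around a call ends up shaped like:
///
///   shlq $47, %state
///   orq  %state, %rsp
///   callq callee                  # tagged with a post-instruction label
///   movq %rsp, %tmp
///   sarq $63, %tmp                # recover the callee's predicate state
///   cmpq $.Lslh_ret_addr, %expected
///   cmovneq %poison, %tmp         # poison the state on a steered return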
void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
    MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  auto InsertPt = MI.getIterator();
  const DebugLoc &Loc = MI.getDebugLoc();

  if (FenceCallAndRet) {
    if (MI.isReturn())
      // Tail call, we don't return to this function.
      // FIXME: We should also handle noreturn calls.
      return;

    // We don't need to fence before the call because the function should fence
    // in its entry. However, we do need to fence after the call returns.
    // Fencing before the return doesn't correctly handle cases where the return
    // itself is mispredicted.
    BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
    ++NumInstsInserted;
    ++NumLFENCEsInserted;
    return;
  }

  // First, we transfer the predicate state into the called function by merging
  // it into the stack pointer. This will kill the current def of the state.
  Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
  mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);

  // If this call is also a return, it is a tail call and we don't need anything
  // else to handle it so just return. Also, if there are no further
  // instructions and no successors, this call does not return so we can also
  // bail.
  if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
    return;

  // Create a symbol to track the return address and attach it to the call
  // machine instruction. We will lower extra symbols attached to call
  // instructions as a label immediately following the call.
  MCSymbol *RetSymbol =
      MF.getContext().createTempSymbol("slh_ret_addr",
                                       /*AlwaysAddSuffix*/ true);
  MI.setPostInstrSymbol(MF, RetSymbol);

  const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
  unsigned ExpectedRetAddrReg = 0;

  // If we have no red zones or if the function returns twice (possibly without
  // using the `ret` instruction) like setjmp, we need to save the expected
  // return address prior to the call.
  if (!Subtarget->getFrameLowering()->has128ByteRedZone(MF) ||
      MF.exposesReturnsTwice()) {
    // If we don't have red zones, we need to compute the expected return
    // address prior to the call and store it in a register that lives across
    // the call.
    //
    // In some ways, this is doubly satisfying as a mitigation because it will
    // also successfully detect stack smashing bugs in some cases (typically,
    // when a callee-saved register is used and the callee doesn't push it onto
    // the stack). But that isn't our primary goal, so we only use it as
    // an incidental benefit.
    //
    // FIXME: It isn't clear that this is reliable in the face of
    // rematerialization in the register allocator. We somehow need to force
    // that to not occur for this particular instruction, and instead to spill
    // or otherwise preserve the value computed *prior* to the call.
    //
    // FIXME: It is even less clear why MachineCSE can't just fold this when we
    // end up having to use identical instructions both before and after the
    // call to feed the comparison.
    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
        !Subtarget->isPositionIndependent()) {
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
          .addSym(RetSymbol);
    } else {
      BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
          .addReg(/*Base*/ X86::RIP)
          .addImm(/*Scale*/ 1)
          .addReg(/*Index*/ 0)
          .addSym(RetSymbol)
          .addReg(/*Segment*/ 0);
    }
  }

  // Step past the call to handle when it returns.
  ++InsertPt;

  // If we didn't pre-compute the expected return address into a register, then
  // red zones are enabled and the return address is still available on the
  // stack immediately after the call. As the very first instruction, we load it
  // into a register.
  if (!ExpectedRetAddrReg) {
    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
        .addReg(/*Base*/ X86::RSP)
        .addImm(/*Scale*/ 1)
        .addReg(/*Index*/ 0)
        .addImm(/*Displacement*/ -8) // The stack pointer has been popped, so
                                     // the return address is 8-bytes past it.
        .addReg(/*Segment*/ 0);
  }

  // Now we extract the callee's predicate state from the stack pointer.
  unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);

  // Test the expected return address against our actual address. If we can
  // form this basic block's address as an immediate, this is easy. Otherwise
  // we compute it.
  if (MF.getTarget().getCodeModel() == CodeModel::Small &&
      !Subtarget->isPositionIndependent()) {
    // FIXME: Could we fold this with the load? It would require careful EFLAGS
    // management.
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
        .addReg(ExpectedRetAddrReg, RegState::Kill)
        .addSym(RetSymbol);
  } else {
    Register ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
        .addReg(/*Base*/ X86::RIP)
        .addImm(/*Scale*/ 1)
        .addReg(/*Index*/ 0)
        .addSym(RetSymbol)
        .addReg(/*Segment*/ 0);
    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
        .addReg(ExpectedRetAddrReg, RegState::Kill)
        .addReg(ActualRetAddrReg, RegState::Kill);
  }

  // Now conditionally update the predicate state we just extracted if we ended
  // up at a different return address than expected.
  int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
  auto CMovOp = X86::getCMovOpcode(PredStateSizeInBytes);

  Register UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
  auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
                   .addReg(NewStateReg, RegState::Kill)
                   .addReg(PS->PoisonReg)
                   .addImm(X86::COND_NE);
  CMovI->findRegisterUseOperand(X86::EFLAGS, /*TRI=*/nullptr)->setIsKill(true);
  ++NumInstsInserted;
  LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");

  PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
}
/// An attacker may speculatively store over a value that is then speculatively
/// loaded and used as the target of an indirect call or jump instruction. This
/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
/// in this paper:
/// https://people.csail.mit.edu/vlk/spectre11.pdf
///
/// When this happens, the speculative execution of the call or jump will end up
/// being steered to this attacker controlled address. While most such loads
/// will be adequately hardened already, we want to ensure that they are
/// definitively treated as needing post-load hardening. While address hardening
/// is sufficient to prevent secret data from leaking to the attacker, it may
/// not be sufficient to prevent an attacker from steering speculative
/// execution. We forcibly unfolded all relevant loads above and so will always
/// have an opportunity to post-load harden here; we just need to scan for cases
/// not already flagged and add them.
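///
/// As a rough sketch (illustrative only), an indirect jump through a register
/// becomes:
///
///   orq  %state, %rax
///   jmpq *%rax
///
/// so that under misspeculation the target register reads as all-ones rather
/// than an attacker-chosen address.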
void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
    MachineInstr &MI,
    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
  switch (MI.getOpcode()) {
  case X86::FARCALL16m:
  case X86::FARCALL32m:
  case X86::FARCALL64m:
  case X86::FARJMP16m:
  case X86::FARJMP32m:
  case X86::FARJMP64m:
    // We don't need to harden either far calls or far jumps as they are
    // safe from Spectre.
    return;

  default:
    break;
  }

  // We should never see a loading instruction at this point, as those should
  // have been unfolded.
  assert(!MI.mayLoad() && "Found a lingering loading instruction!");

  // If the first operand isn't a register, this is a branch or call
  // instruction with an immediate operand which doesn't need to be hardened.
  if (!MI.getOperand(0).isReg())
    return;

  // For all of these, the target register is the first operand of the
  // instruction.
  auto &TargetOp = MI.getOperand(0);
  Register OldTargetReg = TargetOp.getReg();

  // Try to lookup a hardened version of this register. We retain a reference
  // here as we want to update the map to track any newly computed hardened
  // register.
  unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];

  // If we don't have a hardened register yet, compute one. Otherwise, just use
  // the already hardened register.
  //
  // FIXME: It is a little suspect that we use partially hardened registers that
  // only feed addresses. The complexity of partial hardening with SHRX
  // continues to pile up. Should definitively measure its value and consider
  // eliminating it.
  if (!HardenedTargetReg)
    HardenedTargetReg = hardenValueInRegister(
        OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());

  // Set the target operand to the hardened register.
  TargetOp.setReg(HardenedTargetReg);

  ++NumCallsOrJumpsHardened;
}
INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
                      "X86 speculative load hardener", false, false)
INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
                    "X86 speculative load hardener", false, false)

FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
  return new X86SpeculativeLoadHardeningPass();
}