lld/ELF/Arch/X86_64.cpp

   1 //===- X86_64.cpp ---------------------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "OutputSections.h"
  10 #include "Relocations.h"
  11 #include "Symbols.h"
  12 #include "SyntheticSections.h"
  13 #include "Target.h"
  14 #include "lld/Common/ErrorHandler.h"
  15 #include "llvm/BinaryFormat/ELF.h"
  16 #include "llvm/Support/Endian.h"
  17 #include "llvm/Support/MathExtras.h"
  18
  19 using namespace llvm;
  20 using namespace llvm::object;
  21 using namespace llvm::support::endian;
  22 using namespace llvm::ELF;
  23 using namespace lld;
  24 using namespace lld::elf;
  25
  26 namespace {
// TargetInfo implementation for x86-64. The constructor fills in the
// target-specific relocation types and GOT/PLT sizes; every other member
// overrides a TargetInfo hook and is defined later in this file.
class X86_64 : public TargetInfo {
public:
  X86_64();
  // Returns 1 for the TLSDESC relocation types and 2 for every other type.
  int getTlsGdRelaxSkip(RelType type) const override;
  // Maps a relocation type to the RelExpr describing how its value is
  // computed; reports an error for unknown types.
  RelExpr getRelExpr(RelType type, const Symbol &s,
                     const uint8_t *loc) const override;
  // Returns `type` if it may be emitted as a dynamic relocation, otherwise
  // R_X86_64_NONE.
  RelType getDynRel(RelType type) const override;
  // Writers for the .got.plt header/entries and the PLT header/entries.
  void writeGotPltHeader(uint8_t *buf) const override;
  void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  void writeIgotPlt(uint8_t *buf, const Symbol &s) const override;
  void writePltHeader(uint8_t *buf) const override;
  void writePlt(uint8_t *buf, const Symbol &sym,
                uint64_t pltEntryAddr) const override;
  // Applies a single relocation (including TLS/GOT relaxations) at `loc`.
  void relocate(uint8_t *loc, const Relocation &rel,
                uint64_t val) const override;
  // Reads the addend stored in the section contents at `buf`.
  int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
  // Rewrites the opcode of a jump instruction at `loc`.
  void applyJumpInstrMod(uint8_t *loc, JumpModType type,
                         unsigned size) const override;
  // Decides whether a GOT-relative relocation can be relaxed.
  RelExpr adjustGotPcExpr(RelType type, int64_t addend,
                          const uint8_t *loc) const override;
  void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override;
  bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
                                        uint8_t stOther) const override;
  // Removes a trailing jmp that targets the next section, flipping a
  // preceding conditional jump if necessary.
  bool deleteFallThruJmpInsn(InputSection &is, InputFile *file,
                             InputSection *nextIS) const override;
  // Reverts GOT relaxations whose 32-bit displacement would overflow.
  bool relaxOnce(int pass) const override;
};
  54 } // namespace
  55
// This is a vector of NOP instructions of sizes from 1 to 9 bytes (entry i
// holds the encoding that is i + 1 bytes long).  The appropriately sized
// instructions are used to fill the gaps between sections which are executed
// during fall through.
static const std::vector<std::vector<uint8_t>> nopInstructions = {
    {0x90},
    {0x66, 0x90},
    {0x0f, 0x1f, 0x00},
    {0x0f, 0x1f, 0x40, 0x00},
    {0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00},
    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}};
  69
  70 X86_64::X86_64() {
  71   copyRel = R_X86_64_COPY;
  72   gotRel = R_X86_64_GLOB_DAT;
  73   pltRel = R_X86_64_JUMP_SLOT;
  74   relativeRel = R_X86_64_RELATIVE;
  75   iRelativeRel = R_X86_64_IRELATIVE;
  76   symbolicRel = R_X86_64_64;
  77   tlsDescRel = R_X86_64_TLSDESC;
  78   tlsGotRel = R_X86_64_TPOFF64;
  79   tlsModuleIndexRel = R_X86_64_DTPMOD64;
  80   tlsOffsetRel = R_X86_64_DTPOFF64;
  81   gotBaseSymInGotPlt = true;
  82   gotEntrySize = 8;
  83   pltHeaderSize = 16;
  84   pltEntrySize = 16;
  85   ipltEntrySize = 16;
  86   trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
  87   nopInstrs = nopInstructions;
  88
  89   // Align to the large page size (known as a superpage or huge page).
  90   // FreeBSD automatically promotes large, superpage-aligned allocations.
  91   defaultImageBase = 0x200000;
  92 }
  93
  94 int X86_64::getTlsGdRelaxSkip(RelType type) const {
  95   // TLSDESC relocations are processed separately. See relaxTlsGdToLe below.
  96   return type == R_X86_64_GOTPC32_TLSDESC || type == R_X86_64_TLSDESC_CALL ? 1
  97                                                                            : 2;
  98 }
  99
// Symbolic kinds of x86-64 jump instructions. These enumerators identify an
// instruction; they are not the raw opcode bytes. getJmpInsnType maps
// instruction bytes to one of these values, and J_UNKNOWN is used for
// anything that is not recognised.
enum JmpInsnOpcode : uint32_t {
  J_JMP_32,
  J_JNE_32,
  J_JE_32,
  J_JG_32,
  J_JGE_32,
  J_JB_32,
  J_JBE_32,
  J_JL_32,
  J_JLE_32,
  J_JA_32,
  J_JAE_32,
  J_UNKNOWN,
};
 115
 116 // Given the first (optional) and second byte of the insn's opcode, this
 117 // returns the corresponding enum value.
 118 static JmpInsnOpcode getJmpInsnType(const uint8_t *first,
 119                                     const uint8_t *second) {
 120   if (*second == 0xe9)
 121     return J_JMP_32;
 122
 123   if (first == nullptr)
 124     return J_UNKNOWN;
 125
 126   if (*first == 0x0f) {
 127     switch (*second) {
 128     case 0x84:
 129       return J_JE_32;
 130     case 0x85:
 131       return J_JNE_32;
 132     case 0x8f:
 133       return J_JG_32;
 134     case 0x8d:
 135       return J_JGE_32;
 136     case 0x82:
 137       return J_JB_32;
 138     case 0x86:
 139       return J_JBE_32;
 140     case 0x8c:
 141       return J_JL_32;
 142     case 0x8e:
 143       return J_JLE_32;
 144     case 0x87:
 145       return J_JA_32;
 146     case 0x83:
 147       return J_JAE_32;
 148     }
 149   }
 150   return J_UNKNOWN;
 151 }
 152
 153 // Return the relocation index for input section IS with a specific Offset.
 154 // Returns the maximum size of the vector if no such relocation is found.
 155 static unsigned getRelocationWithOffset(const InputSection &is,
 156                                         uint64_t offset) {
 157   unsigned size = is.relocs().size();
 158   for (unsigned i = size - 1; i + 1 > 0; --i) {
 159     if (is.relocs()[i].offset == offset && is.relocs()[i].expr != R_NONE)
 160       return i;
 161   }
 162   return size;
 163 }
 164
 165 // Returns true if R corresponds to a relocation used for a jump instruction.
 166 // TODO: Once special relocations for relaxable jump instructions are available,
 167 // this should be modified to use those relocations.
 168 static bool isRelocationForJmpInsn(Relocation &R) {
 169   return R.type == R_X86_64_PLT32 || R.type == R_X86_64_PC32 ||
 170          R.type == R_X86_64_PC8;
 171 }
 172
 173 // Return true if Relocation R points to the first instruction in the
 174 // next section.
 175 // TODO: Delete this once psABI reserves a new relocation type for fall thru
 176 // jumps.
 177 static bool isFallThruRelocation(InputSection &is, InputFile *file,
 178                                  InputSection *nextIS, Relocation &r) {
 179   if (!isRelocationForJmpInsn(r))
 180     return false;
 181
 182   uint64_t addrLoc = is.getOutputSection()->addr + is.outSecOff + r.offset;
 183   uint64_t targetOffset = InputSectionBase::getRelocTargetVA(
 184       file, r.type, r.addend, addrLoc, *r.sym, r.expr);
 185
 186   // If this jmp is a fall thru, the target offset is the beginning of the
 187   // next section.
 188   uint64_t nextSectionOffset =
 189       nextIS->getOutputSection()->addr + nextIS->outSecOff;
 190   return (addrLoc + 4 + targetOffset) == nextSectionOffset;
 191 }
 192
 193 // Return the jmp instruction opcode that is the inverse of the given
 194 // opcode.  For example, JE inverted is JNE.
 195 static JmpInsnOpcode invertJmpOpcode(const JmpInsnOpcode opcode) {
 196   switch (opcode) {
 197   case J_JE_32:
 198     return J_JNE_32;
 199   case J_JNE_32:
 200     return J_JE_32;
 201   case J_JG_32:
 202     return J_JLE_32;
 203   case J_JGE_32:
 204     return J_JL_32;
 205   case J_JB_32:
 206     return J_JAE_32;
 207   case J_JBE_32:
 208     return J_JA_32;
 209   case J_JL_32:
 210     return J_JGE_32;
 211   case J_JLE_32:
 212     return J_JG_32;
 213   case J_JA_32:
 214     return J_JBE_32;
 215   case J_JAE_32:
 216     return J_JB_32;
 217   default:
 218     return J_UNKNOWN;
 219   }
 220 }
 221
 222 // Deletes direct jump instruction in input sections that jumps to the
 223 // following section as it is not required.  If there are two consecutive jump
 224 // instructions, it checks if they can be flipped and one can be deleted.
 225 // For example:
 226 // .section .text
 227 // a.BB.foo:
 228 //    ...
 229 //    10: jne aa.BB.foo
 230 //    16: jmp bar
 231 // aa.BB.foo:
 232 //    ...
 233 //
 234 // can be converted to:
 235 // a.BB.foo:
 236 //   ...
 237 //   10: je bar  #jne flipped to je and the jmp is deleted.
 238 // aa.BB.foo:
 239 //   ...
 240 bool X86_64::deleteFallThruJmpInsn(InputSection &is, InputFile *file,
 241                                    InputSection *nextIS) const {
 242   const unsigned sizeOfDirectJmpInsn = 5;
 243
 244   if (nextIS == nullptr)
 245     return false;
 246
 247   if (is.getSize() < sizeOfDirectJmpInsn)
 248     return false;
 249
 250   // If this jmp insn can be removed, it is the last insn and the
 251   // relocation is 4 bytes before the end.
 252   unsigned rIndex = getRelocationWithOffset(is, is.getSize() - 4);
 253   if (rIndex == is.relocs().size())
 254     return false;
 255
 256   Relocation &r = is.relocs()[rIndex];
 257
 258   // Check if the relocation corresponds to a direct jmp.
 259   const uint8_t *secContents = is.content().data();
 260   // If it is not a direct jmp instruction, there is nothing to do here.
 261   if (*(secContents + r.offset - 1) != 0xe9)
 262     return false;
 263
 264   if (isFallThruRelocation(is, file, nextIS, r)) {
 265     // This is a fall thru and can be deleted.
 266     r.expr = R_NONE;
 267     r.offset = 0;
 268     is.drop_back(sizeOfDirectJmpInsn);
 269     is.nopFiller = true;
 270     return true;
 271   }
 272
 273   // Now, check if flip and delete is possible.
 274   const unsigned sizeOfJmpCCInsn = 6;
 275   // To flip, there must be at least one JmpCC and one direct jmp.
 276   if (is.getSize() < sizeOfDirectJmpInsn + sizeOfJmpCCInsn)
 277     return false;
 278
 279   unsigned rbIndex =
 280       getRelocationWithOffset(is, (is.getSize() - sizeOfDirectJmpInsn - 4));
 281   if (rbIndex == is.relocs().size())
 282     return false;
 283
 284   Relocation &rB = is.relocs()[rbIndex];
 285
 286   const uint8_t *jmpInsnB = secContents + rB.offset - 1;
 287   JmpInsnOpcode jmpOpcodeB = getJmpInsnType(jmpInsnB - 1, jmpInsnB);
 288   if (jmpOpcodeB == J_UNKNOWN)
 289     return false;
 290
 291   if (!isFallThruRelocation(is, file, nextIS, rB))
 292     return false;
 293
 294   // jmpCC jumps to the fall thru block, the branch can be flipped and the
 295   // jmp can be deleted.
 296   JmpInsnOpcode jInvert = invertJmpOpcode(jmpOpcodeB);
 297   if (jInvert == J_UNKNOWN)
 298     return false;
 299   is.jumpInstrMod = make<JumpInstrMod>();
 300   *is.jumpInstrMod = {rB.offset - 1, jInvert, 4};
 301   // Move R's values to rB except the offset.
 302   rB = {r.expr, r.type, rB.offset, r.addend, r.sym};
 303   // Cancel R
 304   r.expr = R_NONE;
 305   r.offset = 0;
 306   is.drop_back(sizeOfDirectJmpInsn);
 307   is.nopFiller = true;
 308   return true;
 309 }
 310
 311 bool X86_64::relaxOnce(int pass) const {
 312   uint64_t minVA = UINT64_MAX, maxVA = 0;
 313   for (OutputSection *osec : outputSections) {
 314     minVA = std::min(minVA, osec->addr);
 315     maxVA = std::max(maxVA, osec->addr + osec->size);
 316   }
 317   // If the max VA difference is under 2^31, GOT-generating relocations with a 32-bit range cannot overflow.
 318   if (isUInt<31>(maxVA - minVA))
 319     return false;
 320
 321   SmallVector<InputSection *, 0> storage;
 322   bool changed = false;
 323   for (OutputSection *osec : outputSections) {
 324     if (!(osec->flags & SHF_EXECINSTR))
 325       continue;
 326     for (InputSection *sec : getInputSections(*osec, storage)) {
 327       for (Relocation &rel : sec->relocs()) {
 328         if (rel.expr != R_RELAX_GOT_PC)
 329           continue;
 330
 331         uint64_t v = sec->getRelocTargetVA(
 332             sec->file, rel.type, rel.addend,
 333             sec->getOutputSection()->addr + rel.offset, *rel.sym, rel.expr);
 334         if (isInt<32>(v))
 335           continue;
 336         if (rel.sym->auxIdx == 0) {
 337           rel.sym->allocateAux();
 338           addGotEntry(*rel.sym);
 339           changed = true;
 340         }
 341         rel.expr = R_GOT_PC;
 342       }
 343     }
 344   }
 345   return changed;
 346 }
 347
 348 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
 349                            const uint8_t *loc) const {
 350   switch (type) {
 351   case R_X86_64_8:
 352   case R_X86_64_16:
 353   case R_X86_64_32:
 354   case R_X86_64_32S:
 355   case R_X86_64_64:
 356     return R_ABS;
 357   case R_X86_64_DTPOFF32:
 358   case R_X86_64_DTPOFF64:
 359     return R_DTPREL;
 360   case R_X86_64_TPOFF32:
 361     return R_TPREL;
 362   case R_X86_64_TLSDESC_CALL:
 363     return R_TLSDESC_CALL;
 364   case R_X86_64_TLSLD:
 365     return R_TLSLD_PC;
 366   case R_X86_64_TLSGD:
 367     return R_TLSGD_PC;
 368   case R_X86_64_SIZE32:
 369   case R_X86_64_SIZE64:
 370     return R_SIZE;
 371   case R_X86_64_PLT32:
 372     return R_PLT_PC;
 373   case R_X86_64_PC8:
 374   case R_X86_64_PC16:
 375   case R_X86_64_PC32:
 376   case R_X86_64_PC64:
 377     return R_PC;
 378   case R_X86_64_GOT32:
 379   case R_X86_64_GOT64:
 380     return R_GOTPLT;
 381   case R_X86_64_GOTPC32_TLSDESC:
 382     return R_TLSDESC_PC;
 383   case R_X86_64_GOTPCREL:
 384   case R_X86_64_GOTPCRELX:
 385   case R_X86_64_REX_GOTPCRELX:
 386   case R_X86_64_GOTTPOFF:
 387     return R_GOT_PC;
 388   case R_X86_64_GOTOFF64:
 389     return R_GOTPLTREL;
 390   case R_X86_64_PLTOFF64:
 391     return R_PLT_GOTPLT;
 392   case R_X86_64_GOTPC32:
 393   case R_X86_64_GOTPC64:
 394     return R_GOTPLTONLY_PC;
 395   case R_X86_64_NONE:
 396     return R_NONE;
 397   default:
 398     error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) +
 399           ") against symbol " + toString(s));
 400     return R_NONE;
 401   }
 402 }
 403
 404 void X86_64::writeGotPltHeader(uint8_t *buf) const {
 405   // The first entry holds the value of _DYNAMIC. It is not clear why that is
 406   // required, but it is documented in the psabi and the glibc dynamic linker
 407   // seems to use it (note that this is relevant for linking ld.so, not any
 408   // other program).
 409   write64le(buf, mainPart->dynamic->getVA());
 410 }
 411
 412 void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 413   // See comments in X86::writeGotPlt.
 414   write64le(buf, s.getPltVA() + 6);
 415 }
 416
 417 void X86_64::writeIgotPlt(uint8_t *buf, const Symbol &s) const {
 418   // An x86 entry is the address of the ifunc resolver function (for -z rel).
 419   if (config->writeAddends)
 420     write64le(buf, s.getVA());
 421 }
 422
 423 void X86_64::writePltHeader(uint8_t *buf) const {
 424   const uint8_t pltData[] = {
 425       0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
 426       0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
 427       0x0f, 0x1f, 0x40, 0x00, // nop
 428   };
 429   memcpy(buf, pltData, sizeof(pltData));
 430   uint64_t gotPlt = in.gotPlt->getVA();
 431   uint64_t plt = in.ibtPlt ? in.ibtPlt->getVA() : in.plt->getVA();
 432   write32le(buf + 2, gotPlt - plt + 2); // GOTPLT+8
 433   write32le(buf + 8, gotPlt - plt + 4); // GOTPLT+16
 434 }
 435
 436 void X86_64::writePlt(uint8_t *buf, const Symbol &sym,
 437                       uint64_t pltEntryAddr) const {
 438   const uint8_t inst[] = {
 439       0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
 440       0x68, 0, 0, 0, 0,       // pushq <relocation index>
 441       0xe9, 0, 0, 0, 0,       // jmpq plt[0]
 442   };
 443   memcpy(buf, inst, sizeof(inst));
 444
 445   write32le(buf + 2, sym.getGotPltVA() - pltEntryAddr - 6);
 446   write32le(buf + 7, sym.getPltIdx());
 447   write32le(buf + 12, in.plt->getVA() - pltEntryAddr - 16);
 448 }
 449
 450 RelType X86_64::getDynRel(RelType type) const {
 451   if (type == R_X86_64_64 || type == R_X86_64_PC64 || type == R_X86_64_SIZE32 ||
 452       type == R_X86_64_SIZE64)
 453     return type;
 454   return R_X86_64_NONE;
 455 }
 456
 457 static void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) {
 458   if (rel.type == R_X86_64_TLSGD) {
 459     // Convert
 460     //   .byte 0x66
 461     //   leaq x@tlsgd(%rip), %rdi
 462     //   .word 0x6666
 463     //   rex64
 464     //   call __tls_get_addr@plt
 465     // to the following two instructions.
 466     const uint8_t inst[] = {
 467         0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
 468         0x00, 0x00,                            // mov %fs:0x0,%rax
 469         0x48, 0x8d, 0x80, 0,    0,    0,    0, // lea x@tpoff,%rax
 470     };
 471     memcpy(loc - 4, inst, sizeof(inst));
 472
 473     // The original code used a pc relative relocation and so we have to
 474     // compensate for the -4 in had in the addend.
 475     write32le(loc + 8, val + 4);
 476   } else if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
 477     // Convert leaq x@tlsdesc(%rip), %REG to movq $x@tpoff, %REG.
 478     if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
 479         (loc[-1] & 0xc7) != 0x05) {
 480       errorOrWarn(getErrorLocation(loc - 3) +
 481                   "R_X86_64_GOTPC32_TLSDESC must be used "
 482                   "in leaq x@tlsdesc(%rip), %REG");
 483       return;
 484     }
 485     loc[-3] = 0x48 | ((loc[-3] >> 2) & 1);
 486     loc[-2] = 0xc7;
 487     loc[-1] = 0xc0 | ((loc[-1] >> 3) & 7);
 488     write32le(loc, val + 4);
 489   } else {
 490     // Convert call *x@tlsdesc(%REG) to xchg ax, ax.
 491     assert(rel.type == R_X86_64_TLSDESC_CALL);
 492     loc[0] = 0x66;
 493     loc[1] = 0x90;
 494   }
 495 }
 496
 497 static void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) {
 498   if (rel.type == R_X86_64_TLSGD) {
 499     // Convert
 500     //   .byte 0x66
 501     //   leaq x@tlsgd(%rip), %rdi
 502     //   .word 0x6666
 503     //   rex64
 504     //   call __tls_get_addr@plt
 505     // to the following two instructions.
 506     const uint8_t inst[] = {
 507         0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
 508         0x00, 0x00,                            // mov %fs:0x0,%rax
 509         0x48, 0x03, 0x05, 0,    0,    0,    0, // addq x@gottpoff(%rip),%rax
 510     };
 511     memcpy(loc - 4, inst, sizeof(inst));
 512
 513     // Both code sequences are PC relatives, but since we are moving the
 514     // constant forward by 8 bytes we have to subtract the value by 8.
 515     write32le(loc + 8, val - 8);
 516   } else if (rel.type == R_X86_64_GOTPC32_TLSDESC) {
 517     // Convert leaq x@tlsdesc(%rip), %REG to movq x@gottpoff(%rip), %REG.
 518     assert(rel.type == R_X86_64_GOTPC32_TLSDESC);
 519     if ((loc[-3] & 0xfb) != 0x48 || loc[-2] != 0x8d ||
 520         (loc[-1] & 0xc7) != 0x05) {
 521       errorOrWarn(getErrorLocation(loc - 3) +
 522                   "R_X86_64_GOTPC32_TLSDESC must be used "
 523                   "in leaq x@tlsdesc(%rip), %REG");
 524       return;
 525     }
 526     loc[-2] = 0x8b;
 527     write32le(loc, val);
 528   } else {
 529     // Convert call *x@tlsdesc(%rax) to xchg ax, ax.
 530     assert(rel.type == R_X86_64_TLSDESC_CALL);
 531     loc[0] = 0x66;
 532     loc[1] = 0x90;
 533   }
 534 }
 535
 536 // In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to
 537 // R_X86_64_TPOFF32 so that it does not use GOT.
 538 static void relaxTlsIeToLe(uint8_t *loc, const Relocation &, uint64_t val) {
 539   uint8_t *inst = loc - 3;
 540   uint8_t reg = loc[-1] >> 3;
 541   uint8_t *regSlot = loc - 1;
 542
 543   // Note that ADD with RSP or R12 is converted to ADD instead of LEA
 544   // because LEA with these registers needs 4 bytes to encode and thus
 545   // wouldn't fit the space.
 546
 547   if (memcmp(inst, "\x48\x03\x25", 3) == 0) {
 548     // "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
 549     memcpy(inst, "\x48\x81\xc4", 3);
 550   } else if (memcmp(inst, "\x4c\x03\x25", 3) == 0) {
 551     // "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
 552     memcpy(inst, "\x49\x81\xc4", 3);
 553   } else if (memcmp(inst, "\x4c\x03", 2) == 0) {
 554     // "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
 555     memcpy(inst, "\x4d\x8d", 2);
 556     *regSlot = 0x80 | (reg << 3) | reg;
 557   } else if (memcmp(inst, "\x48\x03", 2) == 0) {
 558     // "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
 559     memcpy(inst, "\x48\x8d", 2);
 560     *regSlot = 0x80 | (reg << 3) | reg;
 561   } else if (memcmp(inst, "\x4c\x8b", 2) == 0) {
 562     // "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
 563     memcpy(inst, "\x49\xc7", 2);
 564     *regSlot = 0xc0 | reg;
 565   } else if (memcmp(inst, "\x48\x8b", 2) == 0) {
 566     // "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
 567     memcpy(inst, "\x48\xc7", 2);
 568     *regSlot = 0xc0 | reg;
 569   } else {
 570     error(getErrorLocation(loc - 3) +
 571           "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only");
 572   }
 573
 574   // The original code used a PC relative relocation.
 575   // Need to compensate for the -4 it had in the addend.
 576   write32le(loc, val + 4);
 577 }
 578
 579 static void relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) {
 580   const uint8_t inst[] = {
 581       0x66, 0x66,                                           // .word 0x6666
 582       0x66,                                                 // .byte 0x66
 583       0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0,%rax
 584   };
 585
 586   if (loc[4] == 0xe8) {
 587     // Convert
 588     //   leaq bar@tlsld(%rip), %rdi           # 48 8d 3d <Loc>
 589     //   callq __tls_get_addr@PLT             # e8 <disp32>
 590     //   leaq bar@dtpoff(%rax), %rcx
 591     // to
 592     //   .word 0x6666
 593     //   .byte 0x66
 594     //   mov %fs:0,%rax
 595     //   leaq bar@tpoff(%rax), %rcx
 596     memcpy(loc - 3, inst, sizeof(inst));
 597     return;
 598   }
 599
 600   if (loc[4] == 0xff && loc[5] == 0x15) {
 601     // Convert
 602     //   leaq  x@tlsld(%rip),%rdi               # 48 8d 3d <Loc>
 603     //   call *__tls_get_addr@GOTPCREL(%rip)    # ff 15 <disp32>
 604     // to
 605     //   .long  0x66666666
 606     //   movq   %fs:0,%rax
 607     // See "Table 11.9: LD -> LE Code Transition (LP64)" in
 608     // https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
 609     loc[-3] = 0x66;
 610     memcpy(loc - 2, inst, sizeof(inst));
 611     return;
 612   }
 613
 614   error(getErrorLocation(loc - 3) +
 615         "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD");
 616 }
 617
 618 // A JumpInstrMod at a specific offset indicates that the jump instruction
 619 // opcode at that offset must be modified.  This is specifically used to relax
 620 // jump instructions with basic block sections.  This function looks at the
 621 // JumpMod and effects the change.
 622 void X86_64::applyJumpInstrMod(uint8_t *loc, JumpModType type,
 623                                unsigned size) const {
 624   switch (type) {
 625   case J_JMP_32:
 626     if (size == 4)
 627       *loc = 0xe9;
 628     else
 629       *loc = 0xeb;
 630     break;
 631   case J_JE_32:
 632     if (size == 4) {
 633       loc[-1] = 0x0f;
 634       *loc = 0x84;
 635     } else
 636       *loc = 0x74;
 637     break;
 638   case J_JNE_32:
 639     if (size == 4) {
 640       loc[-1] = 0x0f;
 641       *loc = 0x85;
 642     } else
 643       *loc = 0x75;
 644     break;
 645   case J_JG_32:
 646     if (size == 4) {
 647       loc[-1] = 0x0f;
 648       *loc = 0x8f;
 649     } else
 650       *loc = 0x7f;
 651     break;
 652   case J_JGE_32:
 653     if (size == 4) {
 654       loc[-1] = 0x0f;
 655       *loc = 0x8d;
 656     } else
 657       *loc = 0x7d;
 658     break;
 659   case J_JB_32:
 660     if (size == 4) {
 661       loc[-1] = 0x0f;
 662       *loc = 0x82;
 663     } else
 664       *loc = 0x72;
 665     break;
 666   case J_JBE_32:
 667     if (size == 4) {
 668       loc[-1] = 0x0f;
 669       *loc = 0x86;
 670     } else
 671       *loc = 0x76;
 672     break;
 673   case J_JL_32:
 674     if (size == 4) {
 675       loc[-1] = 0x0f;
 676       *loc = 0x8c;
 677     } else
 678       *loc = 0x7c;
 679     break;
 680   case J_JLE_32:
 681     if (size == 4) {
 682       loc[-1] = 0x0f;
 683       *loc = 0x8e;
 684     } else
 685       *loc = 0x7e;
 686     break;
 687   case J_JA_32:
 688     if (size == 4) {
 689       loc[-1] = 0x0f;
 690       *loc = 0x87;
 691     } else
 692       *loc = 0x77;
 693     break;
 694   case J_JAE_32:
 695     if (size == 4) {
 696       loc[-1] = 0x0f;
 697       *loc = 0x83;
 698     } else
 699       *loc = 0x73;
 700     break;
 701   case J_UNKNOWN:
 702     llvm_unreachable("Unknown Jump Relocation");
 703   }
 704 }
 705
 706 int64_t X86_64::getImplicitAddend(const uint8_t *buf, RelType type) const {
 707   switch (type) {
 708   case R_X86_64_8:
 709   case R_X86_64_PC8:
 710     return SignExtend64<8>(*buf);
 711   case R_X86_64_16:
 712   case R_X86_64_PC16:
 713     return SignExtend64<16>(read16le(buf));
 714   case R_X86_64_32:
 715   case R_X86_64_32S:
 716   case R_X86_64_TPOFF32:
 717   case R_X86_64_GOT32:
 718   case R_X86_64_GOTPC32:
 719   case R_X86_64_GOTPC32_TLSDESC:
 720   case R_X86_64_GOTPCREL:
 721   case R_X86_64_GOTPCRELX:
 722   case R_X86_64_REX_GOTPCRELX:
 723   case R_X86_64_PC32:
 724   case R_X86_64_GOTTPOFF:
 725   case R_X86_64_PLT32:
 726   case R_X86_64_TLSGD:
 727   case R_X86_64_TLSLD:
 728   case R_X86_64_DTPOFF32:
 729   case R_X86_64_SIZE32:
 730     return SignExtend64<32>(read32le(buf));
 731   case R_X86_64_64:
 732   case R_X86_64_TPOFF64:
 733   case R_X86_64_DTPOFF64:
 734   case R_X86_64_DTPMOD64:
 735   case R_X86_64_PC64:
 736   case R_X86_64_SIZE64:
 737   case R_X86_64_GLOB_DAT:
 738   case R_X86_64_GOT64:
 739   case R_X86_64_GOTOFF64:
 740   case R_X86_64_GOTPC64:
 741   case R_X86_64_PLTOFF64:
 742   case R_X86_64_IRELATIVE:
 743   case R_X86_64_RELATIVE:
 744     return read64le(buf);
 745   case R_X86_64_TLSDESC:
 746     return read64le(buf + 8);
 747   case R_X86_64_JUMP_SLOT:
 748   case R_X86_64_NONE:
 749     // These relocations are defined as not having an implicit addend.
 750     return 0;
 751   default:
 752     internalLinkerError(getErrorLocation(buf),
 753                         "cannot read addend for relocation " + toString(type));
 754     return 0;
 755   }
 756 }
 757
 758 static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val);
 759
 760 void X86_64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
 761   switch (rel.type) {
 762   case R_X86_64_8:
 763     checkIntUInt(loc, val, 8, rel);
 764     *loc = val;
 765     break;
 766   case R_X86_64_PC8:
 767     checkInt(loc, val, 8, rel);
 768     *loc = val;
 769     break;
 770   case R_X86_64_16:
 771     checkIntUInt(loc, val, 16, rel);
 772     write16le(loc, val);
 773     break;
 774   case R_X86_64_PC16:
 775     checkInt(loc, val, 16, rel);
 776     write16le(loc, val);
 777     break;
 778   case R_X86_64_32:
 779     checkUInt(loc, val, 32, rel);
 780     write32le(loc, val);
 781     break;
 782   case R_X86_64_32S:
 783   case R_X86_64_GOT32:
 784   case R_X86_64_GOTPC32:
 785   case R_X86_64_GOTPCREL:
 786   case R_X86_64_PC32:
 787   case R_X86_64_PLT32:
 788   case R_X86_64_DTPOFF32:
 789   case R_X86_64_SIZE32:
 790     checkInt(loc, val, 32, rel);
 791     write32le(loc, val);
 792     break;
 793   case R_X86_64_64:
 794   case R_X86_64_DTPOFF64:
 795   case R_X86_64_PC64:
 796   case R_X86_64_SIZE64:
 797   case R_X86_64_GOT64:
 798   case R_X86_64_GOTOFF64:
 799   case R_X86_64_GOTPC64:
 800   case R_X86_64_PLTOFF64:
 801     write64le(loc, val);
 802     break;
 803   case R_X86_64_GOTPCRELX:
 804   case R_X86_64_REX_GOTPCRELX:
 805     if (rel.expr != R_GOT_PC) {
 806       relaxGot(loc, rel, val);
 807     } else {
 808       checkInt(loc, val, 32, rel);
 809       write32le(loc, val);
 810     }
 811     break;
 812   case R_X86_64_GOTPC32_TLSDESC:
 813   case R_X86_64_TLSDESC_CALL:
 814   case R_X86_64_TLSGD:
 815     if (rel.expr == R_RELAX_TLS_GD_TO_LE) {
 816       relaxTlsGdToLe(loc, rel, val);
 817     } else if (rel.expr == R_RELAX_TLS_GD_TO_IE) {
 818       relaxTlsGdToIe(loc, rel, val);
 819     } else {
 820       checkInt(loc, val, 32, rel);
 821       write32le(loc, val);
 822     }
 823     break;
 824   case R_X86_64_TLSLD:
 825     if (rel.expr == R_RELAX_TLS_LD_TO_LE) {
 826       relaxTlsLdToLe(loc, rel, val);
 827     } else {
 828       checkInt(loc, val, 32, rel);
 829       write32le(loc, val);
 830     }
 831     break;
 832   case R_X86_64_GOTTPOFF:
 833     if (rel.expr == R_RELAX_TLS_IE_TO_LE) {
 834       relaxTlsIeToLe(loc, rel, val);
 835     } else {
 836       checkInt(loc, val, 32, rel);
 837       write32le(loc, val);
 838     }
 839     break;
 840   case R_X86_64_TPOFF32:
 841     checkInt(loc, val, 32, rel);
 842     write32le(loc, val);
 843     break;
 844
 845   case R_X86_64_TLSDESC:
 846     // The addend is stored in the second 64-bit word.
 847     write64le(loc + 8, val);
 848     break;
 849   default:
 850     llvm_unreachable("unknown relocation");
 851   }
 852 }
 853
 854 RelExpr X86_64::adjustGotPcExpr(RelType type, int64_t addend,
 855                                 const uint8_t *loc) const {
 856   // Only R_X86_64_[REX_]GOTPCRELX can be relaxed. GNU as may emit GOTPCRELX
 857   // with addend != -4. Such an instruction does not load the full GOT entry, so
 858   // we cannot relax the relocation. E.g. movl x@GOTPCREL+4(%rip), %rax
 859   // (addend=0) loads the high 32 bits of the GOT entry.
 860   if (!config->relax || addend != -4 ||
 861       (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX))
 862     return R_GOT_PC;
 863   const uint8_t op = loc[-2];
 864   const uint8_t modRm = loc[-1];
 865
 866   // FIXME: When PIC is disabled and foo is defined locally in the
 867   // lower 32 bit address space, memory operand in mov can be converted into
 868   // immediate operand. Otherwise, mov must be changed to lea. We support only
 869   // latter relaxation at this moment.
 870   if (op == 0x8b)
 871     return R_RELAX_GOT_PC;
 872
 873   // Relax call and jmp.
 874   if (op == 0xff && (modRm == 0x15 || modRm == 0x25))
 875     return R_RELAX_GOT_PC;
 876
 877   // We don't support test/binop instructions without a REX prefix.
 878   if (type == R_X86_64_GOTPCRELX)
 879     return R_GOT_PC;
 880
 881   // Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
 882   // If PIC then no relaxation is available.
 883   return config->isPic ? R_GOT_PC : R_RELAX_GOT_PC_NOPIC;
 884 }
 885
 886 // A subset of relaxations can only be applied for no-PIC. This method
 887 // handles such relaxations. Instructions encoding information was taken from:
 888 // "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
 889 // (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
 890 //    64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
 891 static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op,
 892                           uint8_t modRm) {
 893   const uint8_t rex = loc[-3];
 894   // Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
 895   if (op == 0x85) {
 896     // See "TEST-Logical Compare" (4-428 Vol. 2B),
 897     // TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
 898
 899     // ModR/M byte has form XX YYY ZZZ, where
 900     // YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
 901     // XX has different meanings:
 902     // 00: The operand's memory address is in reg1.
 903     // 01: The operand's memory address is reg1 + a byte-sized displacement.
 904     // 10: The operand's memory address is reg1 + a word-sized displacement.
 905     // 11: The operand is reg1 itself.
 906     // If an instruction requires only one operand, the unused reg2 field
 907     // holds extra opcode bits rather than a register code
 908     // 0xC0 == 11 000 000 binary.
 909     // 0x38 == 00 111 000 binary.
 910     // We transfer reg2 to reg1 here as operand.
 911     // See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
 912     loc[-1] = 0xc0 | (modRm & 0x38) >> 3; // ModR/M byte.
 913
 914     // Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
 915     // See "TEST-Logical Compare" (4-428 Vol. 2B).
 916     loc[-2] = 0xf7;
 917
 918     // Move R bit to the B bit in REX byte.
 919     // REX byte is encoded as 0100WRXB, where
 920     // 0100 is 4bit fixed pattern.
 921     // REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
 922     //   default operand size is used (which is 32-bit for most but not all
 923     //   instructions).
 924     // REX.R This 1-bit value is an extension to the MODRM.reg field.
 925     // REX.X This 1-bit value is an extension to the SIB.index field.
 926     // REX.B This 1-bit value is an extension to the MODRM.rm field or the
 927     // SIB.base field.
 928     // See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
 929     loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
 930     write32le(loc, val);
 931     return;
 932   }
 933
 934   // If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
 935   // or xor operations.
 936
 937   // Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
 938   // Logic is close to one for test instruction above, but we also
 939   // write opcode extension here, see below for details.
 940   loc[-1] = 0xc0 | (modRm & 0x38) >> 3 | (op & 0x3c); // ModR/M byte.
 941
 942   // Primary opcode is 0x81, opcode extension is one of:
 943   // 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
 944   // 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
 945   // This value was wrote to MODRM.reg in a line above.
 946   // See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
 947   // "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
 948   // descriptions about each operation.
 949   loc[-2] = 0x81;
 950   loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
 951   write32le(loc, val);
 952 }
 953
 954 static void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) {
 955   assert(isInt<32>(val) &&
 956          "GOTPCRELX should not have been relaxed if it overflows");
 957   const uint8_t op = loc[-2];
 958   const uint8_t modRm = loc[-1];
 959
 960   // Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
 961   if (op == 0x8b) {
 962     loc[-2] = 0x8d;
 963     write32le(loc, val);
 964     return;
 965   }
 966
 967   if (op != 0xff) {
 968     // We are relaxing a rip relative to an absolute, so compensate
 969     // for the old -4 addend.
 970     assert(!config->isPic);
 971     relaxGotNoPic(loc, val + 4, op, modRm);
 972     return;
 973   }
 974
 975   // Convert call/jmp instructions.
 976   if (modRm == 0x15) {
 977     // ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call foo".
 978     // Instead we convert to "addr32 call foo" where addr32 is an instruction
 979     // prefix. That makes result expression to be a single instruction.
 980     loc[-2] = 0x67; // addr32 prefix
 981     loc[-1] = 0xe8; // call
 982     write32le(loc, val);
 983     return;
 984   }
 985
 986   // Convert "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop".
 987   // jmp doesn't return, so it is fine to use nop here, it is just a stub.
 988   assert(modRm == 0x25);
 989   loc[-2] = 0xe9; // jmp
 990   loc[3] = 0x90;  // nop
 991   write32le(loc - 1, val + 1);
 992 }
 993
 994 // A split-stack prologue starts by checking the amount of stack remaining
 995 // in one of two ways:
 996 // A) Comparing of the stack pointer to a field in the tcb.
 997 // B) Or a load of a stack pointer offset with an lea to r10 or r11.
 998 bool X86_64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
 999                                               uint8_t stOther) const {
1000   if (!config->is64) {
1001     error("target doesn't support split stacks");
1002     return false;
1003   }
1004
1005   if (loc + 8 >= end)
1006     return false;
1007
1008   // Replace "cmp %fs:0x70,%rsp" and subsequent branch
1009   // with "stc, nopl 0x0(%rax,%rax,1)"
1010   if (memcmp(loc, "\x64\x48\x3b\x24\x25", 5) == 0) {
1011     memcpy(loc, "\xf9\x0f\x1f\x84\x00\x00\x00\x00", 8);
1012     return true;
1013   }
1014
1015   // Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
1016   // be r10 or r11. The lea instruction feeds a subsequent compare which checks
1017   // if there is X available stack space. Making X larger effectively reserves
1018   // that much additional space. The stack grows downward so subtract the value.
1019   if (memcmp(loc, "\x4c\x8d\x94\x24", 4) == 0 ||
1020       memcmp(loc, "\x4c\x8d\x9c\x24", 4) == 0) {
1021     // The offset bytes are encoded four bytes after the start of the
1022     // instruction.
1023     write32le(loc + 4, read32le(loc + 4) - 0x4000);
1024     return true;
1025   }
1026   return false;
1027 }
1028
1029 void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
1030   uint64_t secAddr = sec.getOutputSection()->addr;
1031   if (auto *s = dyn_cast<InputSection>(&sec))
1032     secAddr += s->outSecOff;
1033   else if (auto *ehIn = dyn_cast<EhInputSection>(&sec))
1034     secAddr += ehIn->getParent()->outSecOff;
1035   for (const Relocation &rel : sec.relocs()) {
1036     if (rel.expr == R_NONE) // See deleteFallThruJmpInsn
1037       continue;
1038     uint8_t *loc = buf + rel.offset;
1039     const uint64_t val =
1040         sec.getRelocTargetVA(sec.file, rel.type, rel.addend,
1041                              secAddr + rel.offset, *rel.sym, rel.expr);
1042     relocate(loc, rel, val);
1043   }
1044   if (sec.jumpInstrMod) {
1045     applyJumpInstrMod(buf + sec.jumpInstrMod->offset,
1046                       sec.jumpInstrMod->original, sec.jumpInstrMod->size);
1047   }
1048 }
1049
1050 // If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
1051 // entries containing endbr64 instructions. A PLT entry will be split into two
1052 // parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
1053 namespace {
1054 class IntelIBT : public X86_64 {
1055 public:
1056   IntelIBT();
1057   void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
1058   void writePlt(uint8_t *buf, const Symbol &sym,
1059                 uint64_t pltEntryAddr) const override;
1060   void writeIBTPlt(uint8_t *buf, size_t numEntries) const override;
1061
1062   static const unsigned IBTPltHeaderSize = 16;
1063 };
1064 } // namespace
1065
1066 IntelIBT::IntelIBT() { pltHeaderSize = 0; }
1067
1068 void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1069   uint64_t va =
1070       in.ibtPlt->getVA() + IBTPltHeaderSize + s.getPltIdx() * pltEntrySize;
1071   write64le(buf, va);
1072 }
1073
1074 void IntelIBT::writePlt(uint8_t *buf, const Symbol &sym,
1075                         uint64_t pltEntryAddr) const {
1076   const uint8_t Inst[] = {
1077       0xf3, 0x0f, 0x1e, 0xfa,       // endbr64
1078       0xff, 0x25, 0,    0,    0, 0, // jmpq *got(%rip)
1079       0x66, 0x0f, 0x1f, 0x44, 0, 0, // nop
1080   };
1081   memcpy(buf, Inst, sizeof(Inst));
1082   write32le(buf + 6, sym.getGotPltVA() - pltEntryAddr - 10);
1083 }
1084
1085 void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const {
1086   writePltHeader(buf);
1087   buf += IBTPltHeaderSize;
1088
1089   const uint8_t inst[] = {
1090       0xf3, 0x0f, 0x1e, 0xfa,    // endbr64
1091       0x68, 0,    0,    0,    0, // pushq <relocation index>
1092       0xe9, 0,    0,    0,    0, // jmpq plt[0]
1093       0x66, 0x90,                // nop
1094   };
1095
1096   for (size_t i = 0; i < numEntries; ++i) {
1097     memcpy(buf, inst, sizeof(inst));
1098     write32le(buf + 5, i);
1099     write32le(buf + 10, -pltHeaderSize - sizeof(inst) * i - 30);
1100     buf += sizeof(inst);
1101   }
1102 }
1103
// These nonstandard PLT entries are to mitigate Spectre v2 security
1105 // vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
1106 // branch instructions such as `jmp *GOTPLT(%rip)`. So, in the following PLT
1107 // entries, we use a CALL followed by MOV and RET to do the same thing as an
1108 // indirect jump. That instruction sequence is so-called "retpoline".
1109 //
1110 // We have two types of retpoline PLTs as a size optimization. If `-z now`
1111 // is specified, all dynamic symbols are resolved at load-time. Thus, when
1112 // that option is given, we can omit code for symbol lazy resolution.
1113 namespace {
1114 class Retpoline : public X86_64 {
1115 public:
1116   Retpoline();
1117   void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
1118   void writePltHeader(uint8_t *buf) const override;
1119   void writePlt(uint8_t *buf, const Symbol &sym,
1120                 uint64_t pltEntryAddr) const override;
1121 };
1122
1123 class RetpolineZNow : public X86_64 {
1124 public:
1125   RetpolineZNow();
1126   void writeGotPlt(uint8_t *buf, const Symbol &s) const override {}
1127   void writePltHeader(uint8_t *buf) const override;
1128   void writePlt(uint8_t *buf, const Symbol &sym,
1129                 uint64_t pltEntryAddr) const override;
1130 };
1131 } // namespace
1132
1133 Retpoline::Retpoline() {
1134   pltHeaderSize = 48;
1135   pltEntrySize = 32;
1136   ipltEntrySize = 32;
1137 }
1138
1139 void Retpoline::writeGotPlt(uint8_t *buf, const Symbol &s) const {
1140   write64le(buf, s.getPltVA() + 17);
1141 }
1142
1143 void Retpoline::writePltHeader(uint8_t *buf) const {
1144   const uint8_t insn[] = {
1145       0xff, 0x35, 0,    0,    0,    0,          // 0:    pushq GOTPLT+8(%rip)
1146       0x4c, 0x8b, 0x1d, 0,    0,    0,    0,    // 6:    mov GOTPLT+16(%rip), %r11
1147       0xe8, 0x0e, 0x00, 0x00, 0x00,             // d:    callq next
1148       0xf3, 0x90,                               // 12: loop: pause
1149       0x0f, 0xae, 0xe8,                         // 14:   lfence
1150       0xeb, 0xf9,                               // 17:   jmp loop
1151       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19:   int3; .align 16
1152       0x4c, 0x89, 0x1c, 0x24,                   // 20: next: mov %r11, (%rsp)
1153       0xc3,                                     // 24:   ret
1154       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 25:   int3; padding
1155       0xcc, 0xcc, 0xcc, 0xcc,                   // 2c:   int3; padding
1156   };
1157   memcpy(buf, insn, sizeof(insn));
1158
1159   uint64_t gotPlt = in.gotPlt->getVA();
1160   uint64_t plt = in.plt->getVA();
1161   write32le(buf + 2, gotPlt - plt - 6 + 8);
1162   write32le(buf + 9, gotPlt - plt - 13 + 16);
1163 }
1164
1165 void Retpoline::writePlt(uint8_t *buf, const Symbol &sym,
1166                          uint64_t pltEntryAddr) const {
1167   const uint8_t insn[] = {
1168       0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0:  mov foo@GOTPLT(%rip), %r11
1169       0xe8, 0,    0,    0,    0,    // 7:  callq plt+0x20
1170       0xe9, 0,    0,    0,    0,    // c:  jmp plt+0x12
1171       0x68, 0,    0,    0,    0,    // 11: pushq <relocation index>
1172       0xe9, 0,    0,    0,    0,    // 16: jmp plt+0
1173       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1b: int3; padding
1174   };
1175   memcpy(buf, insn, sizeof(insn));
1176
1177   uint64_t off = pltEntryAddr - in.plt->getVA();
1178
1179   write32le(buf + 3, sym.getGotPltVA() - pltEntryAddr - 7);
1180   write32le(buf + 8, -off - 12 + 32);
1181   write32le(buf + 13, -off - 17 + 18);
1182   write32le(buf + 18, sym.getPltIdx());
1183   write32le(buf + 23, -off - 27);
1184 }
1185
1186 RetpolineZNow::RetpolineZNow() {
1187   pltHeaderSize = 32;
1188   pltEntrySize = 16;
1189   ipltEntrySize = 16;
1190 }
1191
1192 void RetpolineZNow::writePltHeader(uint8_t *buf) const {
1193   const uint8_t insn[] = {
1194       0xe8, 0x0b, 0x00, 0x00, 0x00, // 0:    call next
1195       0xf3, 0x90,                   // 5:  loop: pause
1196       0x0f, 0xae, 0xe8,             // 7:    lfence
1197       0xeb, 0xf9,                   // a:    jmp loop
1198       0xcc, 0xcc, 0xcc, 0xcc,       // c:    int3; .align 16
1199       0x4c, 0x89, 0x1c, 0x24,       // 10: next: mov %r11, (%rsp)
1200       0xc3,                         // 14:   ret
1201       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 15:   int3; padding
1202       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1a:   int3; padding
1203       0xcc,                         // 1f:   int3; padding
1204   };
1205   memcpy(buf, insn, sizeof(insn));
1206 }
1207
1208 void RetpolineZNow::writePlt(uint8_t *buf, const Symbol &sym,
1209                              uint64_t pltEntryAddr) const {
1210   const uint8_t insn[] = {
1211       0x4c, 0x8b, 0x1d, 0,    0, 0, 0, // mov foo@GOTPLT(%rip), %r11
1212       0xe9, 0,    0,    0,    0,       // jmp plt+0
1213       0xcc, 0xcc, 0xcc, 0xcc,          // int3; padding
1214   };
1215   memcpy(buf, insn, sizeof(insn));
1216
1217   write32le(buf + 3, sym.getGotPltVA() - pltEntryAddr - 7);
1218   write32le(buf + 8, in.plt->getVA() - pltEntryAddr - 12);
1219 }
1220
1221 static TargetInfo *getTargetInfo() {
1222   if (config->zRetpolineplt) {
1223     if (config->zNow) {
1224       static RetpolineZNow t;
1225       return &t;
1226     }
1227     static Retpoline t;
1228     return &t;
1229   }
1230
1231   if (config->andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) {
1232     static IntelIBT t;
1233     return &t;
1234   }
1235
1236   static X86_64 t;
1237   return &t;
1238 }
1239
1240 TargetInfo *elf::getX86_64TargetInfo() { return getTargetInfo(); }