llvm/lib/Target/AArch64/AArch64ExpandImm.cpp

//===- AArch64ExpandImm.cpp - AArch64 Immediate Expansion -----------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
// This file implements the expansion of AArch64 move-immediate pseudo
// instructions into sequences of MOVZ/MOVN/MOVK and ORR instructions.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "AArch64.h"
  14 #include "AArch64ExpandImm.h"
  15 #include "MCTargetDesc/AArch64AddressingModes.h"
  16
  17 using namespace llvm;
  18 using namespace llvm::AArch64_IMM;
  19
  20 /// Helper function which extracts the specified 16-bit chunk from a
  21 /// 64-bit value.
  22 static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
  23   assert(ChunkIdx < 4 && "Out of range chunk index specified!");
  24
  25   return (Imm >> (ChunkIdx * 16)) & 0xFFFF;
  26 }
  27
  28 /// Check whether the given 16-bit chunk replicated to full 64-bit width
  29 /// can be materialized with an ORR instruction.
  30 static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
  31   Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
  32
  33   return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
  34 }
  35
  36 /// Check for identical 16-bit chunks within the constant and if so
  37 /// materialize them with a single ORR instruction. The remaining one or two
  38 /// 16-bit chunks will be materialized with MOVK instructions.
  39 ///
  40 /// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
  41 /// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
  42 /// an ORR instruction.
  43 static bool tryToreplicateChunks(uint64_t UImm,
  44                                  SmallVectorImpl<ImmInsnModel> &Insn) {
  45   using CountMap = DenseMap<uint64_t, unsigned>;
  46
  47   CountMap Counts;
  48
  49   // Scan the constant and count how often every chunk occurs.
  50   for (unsigned Idx = 0; Idx < 4; ++Idx)
  51     ++Counts[getChunk(UImm, Idx)];
  52
  53   // Traverse the chunks to find one which occurs more than once.
  54   for (const auto &Chunk : Counts) {
  55     const uint64_t ChunkVal = Chunk.first;
  56     const unsigned Count = Chunk.second;
  57
  58     uint64_t Encoding = 0;
  59
  60     // We are looking for chunks which have two or three instances and can be
  61     // materialized with an ORR instruction.
  62     if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
  63       continue;
  64
  65     const bool CountThree = Count == 3;
  66
  67     Insn.push_back({ AArch64::ORRXri, 0, Encoding });
  68
  69     unsigned ShiftAmt = 0;
  70     uint64_t Imm16 = 0;
  71     // Find the first chunk not materialized with the ORR instruction.
  72     for (; ShiftAmt < 64; ShiftAmt += 16) {
  73       Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
  74
  75       if (Imm16 != ChunkVal)
  76         break;
  77     }
  78
  79     // Create the first MOVK instruction.
  80     Insn.push_back({ AArch64::MOVKXi, Imm16,
  81                      AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });
  82
  83     // In case we have three instances the whole constant is now materialized
  84     // and we can exit.
  85     if (CountThree)
  86       return true;
  87
  88     // Find the remaining chunk which needs to be materialized.
  89     for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
  90       Imm16 = (UImm >> ShiftAmt) & 0xFFFF;
  91
  92       if (Imm16 != ChunkVal)
  93         break;
  94     }
  95     Insn.push_back({ AArch64::MOVKXi, Imm16,
  96                      AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt) });
  97     return true;
  98   }
  99
 100   return false;
 101 }
 102
 103 /// Check whether this chunk matches the pattern '1...0...'. This pattern
 104 /// starts a contiguous sequence of ones if we look at the bits from the LSB
 105 /// towards the MSB.
 106 static bool isStartChunk(uint64_t Chunk) {
 107   if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
 108     return false;
 109
 110   return isMask_64(~Chunk);
 111 }
 112
 113 /// Check whether this chunk matches the pattern '0...1...' This pattern
 114 /// ends a contiguous sequence of ones if we look at the bits from the LSB
 115 /// towards the MSB.
 116 static bool isEndChunk(uint64_t Chunk) {
 117   if (Chunk == 0 || Chunk == std::numeric_limits<uint64_t>::max())
 118     return false;
 119
 120   return isMask_64(Chunk);
 121 }
 122
 123 /// Clear or set all bits in the chunk at the given index.
 124 static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
 125   const uint64_t Mask = 0xFFFF;
 126
 127   if (Clear)
 128     // Clear chunk in the immediate.
 129     Imm &= ~(Mask << (Idx * 16));
 130   else
 131     // Set all bits in the immediate for the particular chunk.
 132     Imm |= Mask << (Idx * 16);
 133
 134   return Imm;
 135 }
 136
 137 /// Check whether the constant contains a sequence of contiguous ones,
 138 /// which might be interrupted by one or two chunks. If so, materialize the
 139 /// sequence of contiguous ones with an ORR instruction.
 140 /// Materialize the chunks which are either interrupting the sequence or outside
 141 /// of the sequence with a MOVK instruction.
 142 ///
 143 /// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
 144 /// which ends the sequence (0...1...). Then we are looking for constants which
 145 /// contain at least one S and E chunk.
 146 /// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|.
 147 ///
 148 /// We are also looking for constants like |S|A|B|E| where the contiguous
 149 /// sequence of ones wraps around the MSB into the LSB.
 150 static bool trySequenceOfOnes(uint64_t UImm,
 151                               SmallVectorImpl<ImmInsnModel> &Insn) {
 152   const int NotSet = -1;
 153   const uint64_t Mask = 0xFFFF;
 154
 155   int StartIdx = NotSet;
 156   int EndIdx = NotSet;
 157   // Try to find the chunks which start/end a contiguous sequence of ones.
 158   for (int Idx = 0; Idx < 4; ++Idx) {
 159     int64_t Chunk = getChunk(UImm, Idx);
 160     // Sign extend the 16-bit chunk to 64-bit.
 161     Chunk = (Chunk << 48) >> 48;
 162
 163     if (isStartChunk(Chunk))
 164       StartIdx = Idx;
 165     else if (isEndChunk(Chunk))
 166       EndIdx = Idx;
 167   }
 168
 169   // Early exit in case we can't find a start/end chunk.
 170   if (StartIdx == NotSet || EndIdx == NotSet)
 171     return false;
 172
 173   // Outside of the contiguous sequence of ones everything needs to be zero.
 174   uint64_t Outside = 0;
 175   // Chunks between the start and end chunk need to have all their bits set.
 176   uint64_t Inside = Mask;
 177
 178   // If our contiguous sequence of ones wraps around from the MSB into the LSB,
 179   // just swap indices and pretend we are materializing a contiguous sequence
 180   // of zeros surrounded by a contiguous sequence of ones.
 181   if (StartIdx > EndIdx) {
 182     std::swap(StartIdx, EndIdx);
 183     std::swap(Outside, Inside);
 184   }
 185
 186   uint64_t OrrImm = UImm;
 187   int FirstMovkIdx = NotSet;
 188   int SecondMovkIdx = NotSet;
 189
 190   // Find out which chunks we need to patch up to obtain a contiguous sequence
 191   // of ones.
 192   for (int Idx = 0; Idx < 4; ++Idx) {
 193     const uint64_t Chunk = getChunk(UImm, Idx);
 194
 195     // Check whether we are looking at a chunk which is not part of the
 196     // contiguous sequence of ones.
 197     if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) {
 198       OrrImm = updateImm(OrrImm, Idx, Outside == 0);
 199
 200       // Remember the index we need to patch.
 201       if (FirstMovkIdx == NotSet)
 202         FirstMovkIdx = Idx;
 203       else
 204         SecondMovkIdx = Idx;
 205
 206       // Check whether we are looking a chunk which is part of the contiguous
 207       // sequence of ones.
 208     } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
 209       OrrImm = updateImm(OrrImm, Idx, Inside != Mask);
 210
 211       // Remember the index we need to patch.
 212       if (FirstMovkIdx == NotSet)
 213         FirstMovkIdx = Idx;
 214       else
 215         SecondMovkIdx = Idx;
 216     }
 217   }
 218   assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");
 219
 220   // Create the ORR-immediate instruction.
 221   uint64_t Encoding = 0;
 222   AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
 223   Insn.push_back({ AArch64::ORRXri, 0, Encoding });
 224
 225   const bool SingleMovk = SecondMovkIdx == NotSet;
 226   Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, FirstMovkIdx),
 227                    AArch64_AM::getShifterImm(AArch64_AM::LSL,
 228                                              FirstMovkIdx * 16) });
 229
 230   // Early exit in case we only need to emit a single MOVK instruction.
 231   if (SingleMovk)
 232     return true;
 233
 234   // Create the second MOVK instruction.
 235   Insn.push_back({ AArch64::MOVKXi, getChunk(UImm, SecondMovkIdx),
 236                    AArch64_AM::getShifterImm(AArch64_AM::LSL,
 237                                              SecondMovkIdx * 16) });
 238
 239   return true;
 240 }
 241
 242 /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to a
 243 /// MOVZ or MOVN of width BitSize followed by up to 3 MOVK instructions.
 244 static inline void expandMOVImmSimple(uint64_t Imm, unsigned BitSize,
 245                                       unsigned OneChunks, unsigned ZeroChunks,
 246                                       SmallVectorImpl<ImmInsnModel> &Insn) {
 247   const unsigned Mask = 0xFFFF;
 248
 249   // Use a MOVZ or MOVN instruction to set the high bits, followed by one or
 250   // more MOVK instructions to insert additional 16-bit portions into the
 251   // lower bits.
 252   bool isNeg = false;
 253
 254   // Use MOVN to materialize the high bits if we have more all one chunks
 255   // than all zero chunks.
 256   if (OneChunks > ZeroChunks) {
 257     isNeg = true;
 258     Imm = ~Imm;
 259   }
 260
 261   unsigned FirstOpc;
 262   if (BitSize == 32) {
 263     Imm &= (1LL << 32) - 1;
 264     FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
 265   } else {
 266     FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
 267   }
 268   unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
 269   unsigned LastShift = 0; // LSL amount for last MOVK
 270   if (Imm != 0) {
 271     unsigned LZ = countLeadingZeros(Imm);
 272     unsigned TZ = countTrailingZeros(Imm);
 273     Shift = (TZ / 16) * 16;
 274     LastShift = ((63 - LZ) / 16) * 16;
 275   }
 276   unsigned Imm16 = (Imm >> Shift) & Mask;
 277
 278   Insn.push_back({ FirstOpc, Imm16,
 279                    AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
 280
 281   if (Shift == LastShift)
 282     return;
 283
 284   // If a MOVN was used for the high bits of a negative value, flip the rest
 285   // of the bits back for use with MOVK.
 286   if (isNeg)
 287     Imm = ~Imm;
 288
 289   unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
 290   while (Shift < LastShift) {
 291     Shift += 16;
 292     Imm16 = (Imm >> Shift) & Mask;
 293     if (Imm16 == (isNeg ? Mask : 0))
 294       continue; // This 16-bit portion is already set correctly.
 295
 296     Insn.push_back({ Opc, Imm16,
 297                      AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
 298   }
 299 }
 300
 301 /// Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
 302 /// real move-immediate instructions to synthesize the immediate.
 303 void AArch64_IMM::expandMOVImm(uint64_t Imm, unsigned BitSize,
 304                                SmallVectorImpl<ImmInsnModel> &Insn) {
 305   const unsigned Mask = 0xFFFF;
 306
 307   // Scan the immediate and count the number of 16-bit chunks which are either
 308   // all ones or all zeros.
 309   unsigned OneChunks = 0;
 310   unsigned ZeroChunks = 0;
 311   for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
 312     const unsigned Chunk = (Imm >> Shift) & Mask;
 313     if (Chunk == Mask)
 314       OneChunks++;
 315     else if (Chunk == 0)
 316       ZeroChunks++;
 317   }
 318
 319   // Prefer MOVZ/MOVN over ORR because of the rules for the "mov" alias.
 320   if ((BitSize / 16) - OneChunks <= 1 || (BitSize / 16) - ZeroChunks <= 1) {
 321     expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
 322     return;
 323   }
 324
 325   // Try a single ORR.
 326   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
 327   uint64_t Encoding;
 328   if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
 329     unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
 330     Insn.push_back({ Opc, 0, Encoding });
 331     return;
 332   }
 333
 334   // One to up three instruction sequences.
 335   //
 336   // Prefer MOVZ/MOVN followed by MOVK; it's more readable, and possibly the
 337   // fastest sequence with fast literal generation.
 338   if (OneChunks >= (BitSize / 16) - 2 || ZeroChunks >= (BitSize / 16) - 2) {
 339     expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
 340     return;
 341   }
 342
 343   assert(BitSize == 64 && "All 32-bit immediates can be expanded with a"
 344                           "MOVZ/MOVK pair");
 345
 346   // Try other two-instruction sequences.
 347
 348   // 64-bit ORR followed by MOVK.
 349   // We try to construct the ORR immediate in three different ways: either we
 350   // zero out the chunk which will be replaced, we fill the chunk which will
 351   // be replaced with ones, or we take the bit pattern from the other half of
 352   // the 64-bit immediate. This is comprehensive because of the way ORR
 353   // immediates are constructed.
 354   for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
 355     uint64_t ShiftedMask = (0xFFFFULL << Shift);
 356     uint64_t ZeroChunk = UImm & ~ShiftedMask;
 357     uint64_t OneChunk = UImm | ShiftedMask;
 358     uint64_t RotatedImm = (UImm << 32) | (UImm >> 32);
 359     uint64_t ReplicateChunk = ZeroChunk | (RotatedImm & ShiftedMask);
 360     if (AArch64_AM::processLogicalImmediate(ZeroChunk, BitSize, Encoding) ||
 361         AArch64_AM::processLogicalImmediate(OneChunk, BitSize, Encoding) ||
 362         AArch64_AM::processLogicalImmediate(ReplicateChunk, BitSize,
 363                                             Encoding)) {
 364       // Create the ORR-immediate instruction.
 365       Insn.push_back({ AArch64::ORRXri, 0, Encoding });
 366
 367       // Create the MOVK instruction.
 368       const unsigned Imm16 = getChunk(UImm, Shift / 16);
 369       Insn.push_back({ AArch64::MOVKXi, Imm16,
 370                        AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift) });
 371       return;
 372     }
 373   }
 374
 375   // FIXME: Add more two-instruction sequences.
 376
 377   // Three instruction sequences.
 378   //
 379   // Prefer MOVZ/MOVN followed by two MOVK; it's more readable, and possibly
 380   // the fastest sequence with fast literal generation. (If neither MOVK is
 381   // part of a fast literal generation pair, it could be slower than the
 382   // four-instruction sequence, but we won't worry about that for now.)
 383   if (OneChunks || ZeroChunks) {
 384     expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
 385     return;
 386   }
 387
 388   // Check for identical 16-bit chunks within the constant and if so materialize
 389   // them with a single ORR instruction. The remaining one or two 16-bit chunks
 390   // will be materialized with MOVK instructions.
 391   if (BitSize == 64 && tryToreplicateChunks(UImm, Insn))
 392     return;
 393
 394   // Check whether the constant contains a sequence of contiguous ones, which
 395   // might be interrupted by one or two chunks. If so, materialize the sequence
 396   // of contiguous ones with an ORR instruction. Materialize the chunks which
 397   // are either interrupting the sequence or outside of the sequence with a
 398   // MOVK instruction.
 399   if (BitSize == 64 && trySequenceOfOnes(UImm, Insn))
 400     return;
 401
 402   // We found no possible two or three instruction sequence; use the general
 403   // four-instruction sequence.
 404   expandMOVImmSimple(Imm, BitSize, OneChunks, ZeroChunks, Insn);
 405 }