lib/Target/CellSPU/SPUISelLowering.cpp

   1 //===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
   2 //                     The LLVM Compiler Infrastructure
   3 //
   4 // This file is distributed under the University of Illinois Open Source
   5 // License. See LICENSE.TXT for details.
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This file implements the SPUTargetLowering class.
  10 //
  11 //===----------------------------------------------------------------------===//
  12
  13 #include "SPURegisterNames.h"
  14 #include "SPUISelLowering.h"
  15 #include "SPUTargetMachine.h"
  16 #include "SPUFrameLowering.h"
  17 #include "SPUMachineFunction.h"
  18 #include "llvm/Constants.h"
  19 #include "llvm/Function.h"
  20 #include "llvm/Intrinsics.h"
  21 #include "llvm/CallingConv.h"
  22 #include "llvm/Type.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineFunction.h"
  26 #include "llvm/CodeGen/MachineInstrBuilder.h"
  27 #include "llvm/CodeGen/MachineRegisterInfo.h"
  28 #include "llvm/CodeGen/SelectionDAG.h"
  29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
  30 #include "llvm/Target/TargetOptions.h"
  31 #include "llvm/ADT/VectorExtras.h"
  32 #include "llvm/Support/Debug.h"
  33 #include "llvm/Support/ErrorHandling.h"
  34 #include "llvm/Support/MathExtras.h"
  35 #include "llvm/Support/raw_ostream.h"
  36 #include <map>
  37
  38 using namespace llvm;
  39
  40 // Used in getTargetNodeName() below
  41 namespace {
  42   std::map<unsigned, const char *> node_names;
  43
  44   // Byte offset of the preferred slot (counted from the MSB)
  45   int prefslotOffset(EVT VT) {
  46     int retval=0;
  47     if (VT==MVT::i1) retval=3;
  48     if (VT==MVT::i8) retval=3;
  49     if (VT==MVT::i16) retval=2;
  50
  51     return retval;
  52   }
  53
  54   //! Expand a library call into an actual call DAG node
  55   /*!
  56    \note
  57    This code is taken from SelectionDAGLegalize, since it is not exposed as
  58    part of the LLVM SelectionDAG API.
  59    */
  60
  61   SDValue
  62   ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
  63                 bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
  64     // The input chain to this libcall is the entry node of the function.
  65     // Legalizing the call will automatically add the previous call to the
  66     // dependence.
  67     SDValue InChain = DAG.getEntryNode();
  68
  69     TargetLowering::ArgListTy Args;
  70     TargetLowering::ArgListEntry Entry;
  71     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
  72       EVT ArgVT = Op.getOperand(i).getValueType();
  73       const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  74       Entry.Node = Op.getOperand(i);
  75       Entry.Ty = ArgTy;
  76       Entry.isSExt = isSigned;
  77       Entry.isZExt = !isSigned;
  78       Args.push_back(Entry);
  79     }
  80     SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
  81                                            TLI.getPointerTy());
  82
  83     // Splice the libcall in wherever FindInputOutputChains tells us to.
  84     const Type *RetTy =
  85                 Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
  86     std::pair<SDValue, SDValue> CallInfo =
  87             TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
  88                             0, TLI.getLibcallCallingConv(LC), false,
  89                             /*isReturnValueUsed=*/true,
  90                             Callee, Args, DAG, Op.getDebugLoc());
  91
  92     return CallInfo.first;
  93   }
  94 }
  95
  96 SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  97   : TargetLowering(TM, new TargetLoweringObjectFileELF()),
  98     SPUTM(TM) {
  99
 100   // Use _setjmp/_longjmp instead of setjmp/longjmp.
 101   setUseUnderscoreSetJmp(true);
 102   setUseUnderscoreLongJmp(true);
 103
 104   // Set RTLIB libcall names as used by SPU:
 105   setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
 106
 107   // Set up the SPU's register classes:
 108   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
 109   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
 110   addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
 111   addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
 112   addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
 113   addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
 114   addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
 115
 116   // SPU has no sign or zero extended loads for i1, i8, i16:
 117   setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
 118   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
 119   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
 120
 121   setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
 122   setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
 123
 124   setTruncStoreAction(MVT::i128, MVT::i64, Expand);
 125   setTruncStoreAction(MVT::i128, MVT::i32, Expand);
 126   setTruncStoreAction(MVT::i128, MVT::i16, Expand);
 127   setTruncStoreAction(MVT::i128, MVT::i8, Expand);
 128
 129   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 130
 131   // SPU constant load actions are custom lowered:
 132   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
 133   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 134
 135   // SPU's loads and stores have to be custom lowered:
 136   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
 137        ++sctype) {
 138     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 139
 140     setOperationAction(ISD::LOAD,   VT, Custom);
 141     setOperationAction(ISD::STORE,  VT, Custom);
 142     setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
 143     setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
 144     setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
 145
 146     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
 147       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 148       setTruncStoreAction(VT, StoreVT, Expand);
 149     }
 150   }
 151
 152   for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
 153        ++sctype) {
 154     MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
 155
 156     setOperationAction(ISD::LOAD,   VT, Custom);
 157     setOperationAction(ISD::STORE,  VT, Custom);
 158
 159     for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
 160       MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
 161       setTruncStoreAction(VT, StoreVT, Expand);
 162     }
 163   }
 164
 165   // Expand the jumptable branches
 166   setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
 167   setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
 168
 169   // Custom lower SELECT_CC for most cases, but expand by default
 170   setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
 171   setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
 172   setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
 173   setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
 174   setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
 175
 176   // SPU has no intrinsics for these particular operations:
 177   setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
 178
 179   // SPU has no division/remainder instructions
 180   setOperationAction(ISD::SREM,    MVT::i8,   Expand);
 181   setOperationAction(ISD::UREM,    MVT::i8,   Expand);
 182   setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
 183   setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
 184   setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
 185   setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
 186   setOperationAction(ISD::SREM,    MVT::i16,  Expand);
 187   setOperationAction(ISD::UREM,    MVT::i16,  Expand);
 188   setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
 189   setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
 190   setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
 191   setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
 192   setOperationAction(ISD::SREM,    MVT::i32,  Expand);
 193   setOperationAction(ISD::UREM,    MVT::i32,  Expand);
 194   setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
 195   setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
 196   setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
 197   setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
 198   setOperationAction(ISD::SREM,    MVT::i64,  Expand);
 199   setOperationAction(ISD::UREM,    MVT::i64,  Expand);
 200   setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
 201   setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
 202   setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
 203   setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
 204   setOperationAction(ISD::SREM,    MVT::i128, Expand);
 205   setOperationAction(ISD::UREM,    MVT::i128, Expand);
 206   setOperationAction(ISD::SDIV,    MVT::i128, Expand);
 207   setOperationAction(ISD::UDIV,    MVT::i128, Expand);
 208   setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
 209   setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
 210
 211   // We don't support sin/cos/sqrt/fmod
 212   setOperationAction(ISD::FSIN , MVT::f64, Expand);
 213   setOperationAction(ISD::FCOS , MVT::f64, Expand);
 214   setOperationAction(ISD::FREM , MVT::f64, Expand);
 215   setOperationAction(ISD::FSIN , MVT::f32, Expand);
 216   setOperationAction(ISD::FCOS , MVT::f32, Expand);
 217   setOperationAction(ISD::FREM , MVT::f32, Expand);
 218
 219   // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
 220   // for f32!)
 221   setOperationAction(ISD::FSQRT, MVT::f64, Expand);
 222   setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 223
 224   setOperationAction(ISD::FMA, MVT::f64, Expand);
 225   setOperationAction(ISD::FMA, MVT::f32, Expand);
 226
 227   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 228   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 229
 230   // SPU can do rotate right and left, so legalize it... but customize for i8
 231   // because instructions don't exist.
 232
 233   // FIXME: Change from "expand" to appropriate type once ROTR is supported in
 234   //        .td files.
 235   setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
 236   setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
 237   setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
 238
 239   setOperationAction(ISD::ROTL, MVT::i32,    Legal);
 240   setOperationAction(ISD::ROTL, MVT::i16,    Legal);
 241   setOperationAction(ISD::ROTL, MVT::i8,     Custom);
 242
 243   // SPU has no native version of shift left/right for i8
 244   setOperationAction(ISD::SHL,  MVT::i8,     Custom);
 245   setOperationAction(ISD::SRL,  MVT::i8,     Custom);
 246   setOperationAction(ISD::SRA,  MVT::i8,     Custom);
 247
 248   // Make these operations legal and handle them during instruction selection:
 249   setOperationAction(ISD::SHL,  MVT::i64,    Legal);
 250   setOperationAction(ISD::SRL,  MVT::i64,    Legal);
 251   setOperationAction(ISD::SRA,  MVT::i64,    Legal);
 252
 253   // Custom lower i8, i32 and i64 multiplications
 254   setOperationAction(ISD::MUL,  MVT::i8,     Custom);
 255   setOperationAction(ISD::MUL,  MVT::i32,    Legal);
 256   setOperationAction(ISD::MUL,  MVT::i64,    Legal);
 257
 258   // Expand double-width multiplication
 259   // FIXME: It would probably be reasonable to support some of these operations
 260   setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
 261   setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
 262   setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
 263   setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
 264   setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
 265   setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
 266   setOperationAction(ISD::MULHU,     MVT::i16, Expand);
 267   setOperationAction(ISD::MULHS,     MVT::i16, Expand);
 268   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
 269   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 270   setOperationAction(ISD::MULHU,     MVT::i32, Expand);
 271   setOperationAction(ISD::MULHS,     MVT::i32, Expand);
 272   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 273   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 274   setOperationAction(ISD::MULHU,     MVT::i64, Expand);
 275   setOperationAction(ISD::MULHS,     MVT::i64, Expand);
 276
 277   // Need to custom handle (some) common i8, i64 math ops
 278   setOperationAction(ISD::ADD,  MVT::i8,     Custom);
 279   setOperationAction(ISD::ADD,  MVT::i64,    Legal);
 280   setOperationAction(ISD::SUB,  MVT::i8,     Custom);
 281   setOperationAction(ISD::SUB,  MVT::i64,    Legal);
 282
 283   // SPU does not have BSWAP. It does have i32 support CTLZ.
 284   // CTPOP has to be custom lowered.
 285   setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
 286   setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
 287
 288   setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
 289   setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
 290   setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
 291   setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
 292   setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
 293
 294   setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
 295   setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
 296   setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
 297   setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
 298   setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
 299
 300   setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
 301   setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
 302   setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
 303   setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
 304   setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
 305
 306   // SPU has a version of select that implements (a&~c)|(b&c), just like
 307   // select ought to work:
 308   setOperationAction(ISD::SELECT, MVT::i8,   Legal);
 309   setOperationAction(ISD::SELECT, MVT::i16,  Legal);
 310   setOperationAction(ISD::SELECT, MVT::i32,  Legal);
 311   setOperationAction(ISD::SELECT, MVT::i64,  Legal);
 312
 313   setOperationAction(ISD::SETCC, MVT::i8,    Legal);
 314   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
 315   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
 316   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
 317   setOperationAction(ISD::SETCC, MVT::f64,   Custom);
 318
 319   // Custom lower i128 -> i64 truncates
 320   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
 321
 322   // Custom lower i32/i64 -> i128 sign extend
 323   setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
 324
 325   setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
 326   setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
 327   setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
 328   setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
 329   // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
 330   // to expand to a libcall, hence the custom lowering:
 331   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 332   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
 333   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
 334   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
 335   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
 336   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
 337
 338   // FDIV on SPU requires custom lowering
 339   setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
 340
 341   // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
 342   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
 343   setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
 344   setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
 345   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
 346   setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 347   setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
 348   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 349   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 350
 351   setOperationAction(ISD::BITCAST, MVT::i32, Legal);
 352   setOperationAction(ISD::BITCAST, MVT::f32, Legal);
 353   setOperationAction(ISD::BITCAST, MVT::i64, Legal);
 354   setOperationAction(ISD::BITCAST, MVT::f64, Legal);
 355
 356   // We cannot sextinreg(i1).  Expand to shifts.
 357   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 358
 359   // We want to legalize GlobalAddress and ConstantPool nodes into the
 360   // appropriate instructions to materialize the address.
 361   for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
 362        ++sctype) {
 363     MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
 364
 365     setOperationAction(ISD::GlobalAddress,  VT, Custom);
 366     setOperationAction(ISD::ConstantPool,   VT, Custom);
 367     setOperationAction(ISD::JumpTable,      VT, Custom);
 368   }
 369
 370   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 371   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 372
 373   // Use the default implementation.
 374   setOperationAction(ISD::VAARG             , MVT::Other, Expand);
 375   setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
 376   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 377   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
 378   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
 379   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
 380   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
 381
 382   // Cell SPU has instructions for converting between i64 and fp.
 383   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 384   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 385
 386   // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
 387   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
 388
 389   // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
 390   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 391
 392   // First set operation action for all vector types to expand. Then we
 393   // will selectively turn on ones that can be effectively codegen'd.
 394   addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
 395   addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
 396   addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
 397   addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
 398   addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
 399   addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
 400
 401   for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
 402        i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
 403     MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
 404
 405     // add/sub are legal for all supported vector VT's.
 406     setOperationAction(ISD::ADD,     VT, Legal);
 407     setOperationAction(ISD::SUB,     VT, Legal);
 408     // mul has to be custom lowered.
 409     setOperationAction(ISD::MUL,     VT, Legal);
 410
 411     setOperationAction(ISD::AND,     VT, Legal);
 412     setOperationAction(ISD::OR,      VT, Legal);
 413     setOperationAction(ISD::XOR,     VT, Legal);
 414     setOperationAction(ISD::LOAD,    VT, Custom);
 415     setOperationAction(ISD::SELECT,  VT, Legal);
 416     setOperationAction(ISD::STORE,   VT, Custom);
 417
 418     // These operations need to be expanded:
 419     setOperationAction(ISD::SDIV,    VT, Expand);
 420     setOperationAction(ISD::SREM,    VT, Expand);
 421     setOperationAction(ISD::UDIV,    VT, Expand);
 422     setOperationAction(ISD::UREM,    VT, Expand);
 423
 424     // Custom lower build_vector, constant pool spills, insert and
 425     // extract vector elements:
 426     setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
 427     setOperationAction(ISD::ConstantPool, VT, Custom);
 428     setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
 429     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 430     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
 431     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
 432   }
 433
 434   setOperationAction(ISD::AND, MVT::v16i8, Custom);
 435   setOperationAction(ISD::OR,  MVT::v16i8, Custom);
 436   setOperationAction(ISD::XOR, MVT::v16i8, Custom);
 437   setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
 438
 439   setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
 440
 441   setBooleanContents(ZeroOrNegativeOneBooleanContent);
 442
 443   setStackPointerRegisterToSaveRestore(SPU::R1);
 444
 445   // We have target-specific dag combine patterns for the following nodes:
 446   setTargetDAGCombine(ISD::ADD);
 447   setTargetDAGCombine(ISD::ZERO_EXTEND);
 448   setTargetDAGCombine(ISD::SIGN_EXTEND);
 449   setTargetDAGCombine(ISD::ANY_EXTEND);
 450
 451   setMinFunctionAlignment(3);
 452
 453   computeRegisterProperties();
 454
 455   // Set pre-RA register scheduler default to BURR, which produces slightly
 456   // better code than the default (could also be TDRR, but TargetLowering.h
 457   // needs a mod to support that model):
 458   setSchedulingPreference(Sched::RegPressure);
 459 }
 460
 461 const char *
 462 SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
 463 {
 464   if (node_names.empty()) {
 465     node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
 466     node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
 467     node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
 468     node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
 469     node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
 470     node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
 471     node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
 472     node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
 473     node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
 474     node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
 475     node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
 476     node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
 477     node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
 478     node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
 479     node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
 480     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
 481     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
 482     node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
 483     node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
 484             "SPUISD::ROTBYTES_LEFT_BITS";
 485     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
 486     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
 487     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
 488     node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
 489     node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
 490   }
 491
 492   std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
 493
 494   return ((i != node_names.end()) ? i->second : 0);
 495 }
 496
 497 //===----------------------------------------------------------------------===//
 498 // Return the Cell SPU's SETCC result type
 499 //===----------------------------------------------------------------------===//
 500
 501 MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
 502   // i8, i16 and i32 are valid SETCC result types
 503   MVT::SimpleValueType retval;
 504
 505   switch(VT.getSimpleVT().SimpleTy){
 506     case MVT::i1:
 507     case MVT::i8:
 508       retval = MVT::i8; break;
 509     case MVT::i16:
 510       retval = MVT::i16; break;
 511     case MVT::i32:
 512     default:
 513       retval = MVT::i32;
 514   }
 515   return retval;
 516 }
 517
 518 //===----------------------------------------------------------------------===//
 519 // Calling convention code:
 520 //===----------------------------------------------------------------------===//
 521
 522 #include "SPUGenCallingConv.inc"
 523
 524 //===----------------------------------------------------------------------===//
 525 //  LowerOperation implementation
 526 //===----------------------------------------------------------------------===//
 527
 528 /// Custom lower loads for CellSPU
 529 /*!
 530  All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 531  within a 16-byte block, we have to rotate to extract the requested element.
 532
 533  For extending loads, we also want to ensure that the following sequence is
 534  emitted, e.g. for MVT::f32 extending load to MVT::f64:
 535
 536 \verbatim
 537 %1  v16i8,ch = load
 538 %2  v16i8,ch = rotate %1
  539 %3  v4f32,ch = bitconvert %2
  540 %4  f32      = vec2prefslot %3
 541 %5  f64      = fp_extend %4
 542 \endverbatim
 543 */
 544 static SDValue
 545 LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 546   LoadSDNode *LN = cast<LoadSDNode>(Op);
 547   SDValue the_chain = LN->getChain();
 548   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 549   EVT InVT = LN->getMemoryVT();
 550   EVT OutVT = Op.getValueType();
 551   ISD::LoadExtType ExtType = LN->getExtensionType();
 552   unsigned alignment = LN->getAlignment();
 553   int pso = prefslotOffset(InVT);
 554   DebugLoc dl = Op.getDebugLoc();
 555   EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
 556                                                   (128 / InVT.getSizeInBits()));
 557
 558   // two sanity checks
 559   assert( LN->getAddressingMode() == ISD::UNINDEXED
 560           && "we should get only UNINDEXED adresses");
 561   // clean aligned loads can be selected as-is
 562   if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
 563     return SDValue();
 564
 565   // Get pointerinfos to the memory chunk(s) that contain the data to load
 566   uint64_t mpi_offset = LN->getPointerInfo().Offset;
 567   mpi_offset -= mpi_offset%16;
 568   MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
 569   MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
 570
 571   SDValue result;
 572   SDValue basePtr = LN->getBasePtr();
 573   SDValue rotate;
 574
 575   if ((alignment%16) == 0) {
 576     ConstantSDNode *CN;
 577
 578     // Special cases for a known aligned load to simplify the base pointer
 579     // and the rotation amount:
 580     if (basePtr.getOpcode() == ISD::ADD
 581         && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
 582       // Known offset into basePtr
 583       int64_t offset = CN->getSExtValue();
 584       int64_t rotamt = int64_t((offset & 0xf) - pso);
 585
 586       if (rotamt < 0)
 587         rotamt += 16;
 588
 589       rotate = DAG.getConstant(rotamt, MVT::i16);
 590
 591       // Simplify the base pointer for this case:
 592       basePtr = basePtr.getOperand(0);
 593       if ((offset & ~0xf) > 0) {
 594         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 595                               basePtr,
 596                               DAG.getConstant((offset & ~0xf), PtrVT));
 597       }
 598     } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
 599                || (basePtr.getOpcode() == SPUISD::IndirectAddr
 600                    && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
 601                    && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
 602       // Plain aligned a-form address: rotate into preferred slot
 603       // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
 604       int64_t rotamt = -pso;
 605       if (rotamt < 0)
 606         rotamt += 16;
 607       rotate = DAG.getConstant(rotamt, MVT::i16);
 608     } else {
 609       // Offset the rotate amount by the basePtr and the preferred slot
 610       // byte offset
 611       int64_t rotamt = -pso;
 612       if (rotamt < 0)
 613         rotamt += 16;
 614       rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 615                            basePtr,
 616                            DAG.getConstant(rotamt, PtrVT));
 617     }
 618   } else {
 619     // Unaligned load: must be more pessimistic about addressing modes:
 620     if (basePtr.getOpcode() == ISD::ADD) {
 621       MachineFunction &MF = DAG.getMachineFunction();
 622       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 623       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 624       SDValue Flag;
 625
 626       SDValue Op0 = basePtr.getOperand(0);
 627       SDValue Op1 = basePtr.getOperand(1);
 628
 629       if (isa<ConstantSDNode>(Op1)) {
 630         // Convert the (add <ptr>, <const>) to an indirect address contained
 631         // in a register. Note that this is done because we need to avoid
 632         // creating a 0(reg) d-form address due to the SPU's block loads.
 633         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 634         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 635         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 636       } else {
 637         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 638         // will likely be lowered as a reg(reg) x-form address.
 639         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 640       }
 641     } else {
 642       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 643                             basePtr,
 644                             DAG.getConstant(0, PtrVT));
 645    }
 646
 647     // Offset the rotate amount by the basePtr and the preferred slot
 648     // byte offset
 649     rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
 650                          basePtr,
 651                          DAG.getConstant(-pso, PtrVT));
 652   }
 653
 654   // Do the load as a i128 to allow possible shifting
 655   SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
 656                        lowMemPtr,
 657                        LN->isVolatile(), LN->isNonTemporal(), 16);
 658
 659   // When the size is not greater than alignment we get all data with just
 660   // one load
 661   if (alignment >= InVT.getSizeInBits()/8) {
 662     // Update the chain
 663     the_chain = low.getValue(1);
 664
 665     // Rotate into the preferred slot:
 666     result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
 667                          low.getValue(0), rotate);
 668
 669     // Convert the loaded v16i8 vector to the appropriate vector type
 670     // specified by the operand:
 671     EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
 672                                  InVT, (128 / InVT.getSizeInBits()));
 673     result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
 674                          DAG.getNode(ISD::BITCAST, dl, vecVT, result));
 675   }
 676   // When alignment is less than the size, we might need (known only at
 677   // run-time) two loads
 678   // TODO: if the memory address is composed only from constants, we have
 679   // extra kowledge, and might avoid the second load
 680   else {
 681     // storage position offset from lower 16 byte aligned memory chunk
 682     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 683                                   basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
 684     // get a registerfull of ones. (this implementation is a workaround: LLVM
 685     // cannot handle 128 bit signed int constants)
 686     SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
 687     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 688
 689     SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
 690                                DAG.getNode(ISD::ADD, dl, PtrVT,
 691                                            basePtr,
 692                                            DAG.getConstant(16, PtrVT)),
 693                                highMemPtr,
 694                                LN->isVolatile(), LN->isNonTemporal(), 16);
 695
 696     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 697                                                               high.getValue(1));
 698
 699     // Shift the (possible) high part right to compensate the misalignemnt.
 700     // if there is no highpart (i.e. value is i64 and offset is 4), this
 701     // will zero out the high value.
 702     high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
 703                                      DAG.getNode(ISD::SUB, dl, MVT::i32,
 704                                                  DAG.getConstant( 16, MVT::i32),
 705                                                  offset
 706                                                 ));
 707
 708     // Shift the low similarly
 709     // TODO: add SPUISD::SHL_BYTES
 710     low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
 711
 712     // Merge the two parts
 713     result = DAG.getNode(ISD::BITCAST, dl, vecVT,
 714                           DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
 715
 716     if (!InVT.isVector()) {
 717       result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
 718      }
 719
 720   }
 721     // Handle extending loads by extending the scalar result:
 722     if (ExtType == ISD::SEXTLOAD) {
 723       result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
 724     } else if (ExtType == ISD::ZEXTLOAD) {
 725       result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
 726     } else if (ExtType == ISD::EXTLOAD) {
 727       unsigned NewOpc = ISD::ANY_EXTEND;
 728
 729       if (OutVT.isFloatingPoint())
 730         NewOpc = ISD::FP_EXTEND;
 731
 732       result = DAG.getNode(NewOpc, dl, OutVT, result);
 733     }
 734
 735     SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
 736     SDValue retops[2] = {
 737       result,
 738       the_chain
 739     };
 740
 741     result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
 742                          retops, sizeof(retops) / sizeof(retops[0]));
 743     return result;
 744 }
 745
 746 /// Custom lower stores for CellSPU
 747 /*!
 748  All CellSPU stores are aligned to 16-byte boundaries, so for elements
 749  within a 16-byte block, we have to generate a shuffle to insert the
 750  requested element into its place, then store the resulting block.
 751  */
 752 static SDValue
 753 LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 754   StoreSDNode *SN = cast<StoreSDNode>(Op);
 755   SDValue Value = SN->getValue();
 756   EVT VT = Value.getValueType();
 757   EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
 758   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
 759   DebugLoc dl = Op.getDebugLoc();
 760   unsigned alignment = SN->getAlignment();
 761   SDValue result;
 762   EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
 763                                                  (128 / StVT.getSizeInBits()));
 764   // Get pointerinfos to the memory chunk(s) that contain the data to load
 765   uint64_t mpi_offset = SN->getPointerInfo().Offset;
 766   mpi_offset -= mpi_offset%16;
 767   MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
 768   MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);
 769
 770
 771   // two sanity checks
 772   assert( SN->getAddressingMode() == ISD::UNINDEXED
 773           && "we should get only UNINDEXED adresses");
 774   // clean aligned loads can be selected as-is
 775   if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
 776     return SDValue();
 777
 778   SDValue alignLoadVec;
 779   SDValue basePtr = SN->getBasePtr();
 780   SDValue the_chain = SN->getChain();
 781   SDValue insertEltOffs;
 782
 783   if ((alignment%16) == 0) {
 784     ConstantSDNode *CN;
 785     // Special cases for a known aligned load to simplify the base pointer
 786     // and insertion byte:
 787     if (basePtr.getOpcode() == ISD::ADD
 788         && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
 789       // Known offset into basePtr
 790       int64_t offset = CN->getSExtValue();
 791
 792       // Simplify the base pointer for this case:
 793       basePtr = basePtr.getOperand(0);
 794       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 795                                   basePtr,
 796                                   DAG.getConstant((offset & 0xf), PtrVT));
 797
 798       if ((offset & ~0xf) > 0) {
 799         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 800                               basePtr,
 801                               DAG.getConstant((offset & ~0xf), PtrVT));
 802       }
 803     } else {
 804       // Otherwise, assume it's at byte 0 of basePtr
 805       insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 806                                   basePtr,
 807                                   DAG.getConstant(0, PtrVT));
 808       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 809                                   basePtr,
 810                                   DAG.getConstant(0, PtrVT));
 811     }
 812   } else {
 813     // Unaligned load: must be more pessimistic about addressing modes:
 814     if (basePtr.getOpcode() == ISD::ADD) {
 815       MachineFunction &MF = DAG.getMachineFunction();
 816       MachineRegisterInfo &RegInfo = MF.getRegInfo();
 817       unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
 818       SDValue Flag;
 819
 820       SDValue Op0 = basePtr.getOperand(0);
 821       SDValue Op1 = basePtr.getOperand(1);
 822
 823       if (isa<ConstantSDNode>(Op1)) {
 824         // Convert the (add <ptr>, <const>) to an indirect address contained
 825         // in a register. Note that this is done because we need to avoid
 826         // creating a 0(reg) d-form address due to the SPU's block loads.
 827         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 828         the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
 829         basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
 830       } else {
 831         // Convert the (add <arg1>, <arg2>) to an indirect address, which
 832         // will likely be lowered as a reg(reg) x-form address.
 833         basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
 834       }
 835     } else {
 836       basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
 837                             basePtr,
 838                             DAG.getConstant(0, PtrVT));
 839     }
 840
 841     // Insertion point is solely determined by basePtr's contents
 842     insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
 843                                 basePtr,
 844                                 DAG.getConstant(0, PtrVT));
 845   }
 846
 847   // Load the lower part of the memory to which to store.
 848   SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
 849                           lowMemPtr, SN->isVolatile(), SN->isNonTemporal(), 16);
 850
 851   // if we don't need to store over the 16 byte boundary, one store suffices
 852   if (alignment >= StVT.getSizeInBits()/8) {
 853     // Update the chain
 854     the_chain = low.getValue(1);
 855
 856     LoadSDNode *LN = cast<LoadSDNode>(low);
 857     SDValue theValue = SN->getValue();
 858
 859     if (StVT != VT
 860         && (theValue.getOpcode() == ISD::AssertZext
 861             || theValue.getOpcode() == ISD::AssertSext)) {
 862       // Drill down and get the value for zero- and sign-extended
 863       // quantities
 864       theValue = theValue.getOperand(0);
 865     }
 866
 867     // If the base pointer is already a D-form address, then just create
 868     // a new D-form address with a slot offset and the orignal base pointer.
 869     // Otherwise generate a D-form address with the slot offset relative
 870     // to the stack pointer, which is always aligned.
 871 #if !defined(NDEBUG)
 872       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
 873         errs() << "CellSPU LowerSTORE: basePtr = ";
 874         basePtr.getNode()->dump(&DAG);
 875         errs() << "\n";
 876       }
 877 #endif
 878
 879     SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
 880                                       insertEltOffs);
 881     SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
 882                                       theValue);
 883
 884     result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
 885                          vectorizeOp, low,
 886                          DAG.getNode(ISD::BITCAST, dl,
 887                                      MVT::v4i32, insertEltOp));
 888
 889     result = DAG.getStore(the_chain, dl, result, basePtr,
 890                           lowMemPtr,
 891                           LN->isVolatile(), LN->isNonTemporal(),
 892                           16);
 893
 894   }
 895   // do the store when it might cross the 16 byte memory access boundary.
 896   else {
 897     // TODO issue a warning if SN->isVolatile()== true? This is likely not
 898     // what the user wanted.
 899
 900     // address offset from nearest lower 16byte alinged address
 901     SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
 902                                     SN->getBasePtr(),
 903                                     DAG.getConstant(0xf, MVT::i32));
 904     // 16 - offset
 905     SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
 906                                            DAG.getConstant( 16, MVT::i32),
 907                                            offset);
 908     // 16 - sizeof(Value)
 909     SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
 910                                      DAG.getConstant( 16, MVT::i32),
 911                                      DAG.getConstant( VT.getSizeInBits()/8,
 912                                                       MVT::i32));
 913     // get a registerfull of ones
 914     SDValue ones = DAG.getConstant(-1, MVT::v4i32);
 915     ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
 916
 917     // Create the 128 bit masks that have ones where the data to store is
 918     // located.
 919     SDValue lowmask, himask;
 920     // if the value to store don't fill up the an entire 128 bits, zero
 921     // out the last bits of the mask so that only the value we want to store
 922     // is masked.
 923     // this is e.g. in the case of store i32, align 2
 924     if (!VT.isVector()){
 925       Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
 926       lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
 927       lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 928                                                                surplus);
 929       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 930       Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);
 931
 932     }
 933     else {
 934       lowmask = ones;
 935       Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
 936     }
 937     // this will zero, if there are no data that goes to the high quad
 938     himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
 939                                                             offset_compl);
 940     lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
 941                                                              offset);
 942
 943     // Load in the old data and zero out the parts that will be overwritten with
 944     // the new data to store.
 945     SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
 946                                DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 947                                            DAG.getConstant( 16, PtrVT)),
 948                                highMemPtr,
 949                                SN->isVolatile(), SN->isNonTemporal(), 16);
 950     the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
 951                                                               hi.getValue(1));
 952
 953     low = DAG.getNode(ISD::AND, dl, MVT::i128,
 954                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
 955                         DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
 956     hi = DAG.getNode(ISD::AND, dl, MVT::i128,
 957                         DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
 958                         DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));
 959
 960     // Shift the Value to store into place. rlow contains the parts that go to
 961     // the lower memory chunk, rhi has the parts that go to the upper one.
 962     SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
 963     rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
 964     SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
 965                                                             offset_compl);
 966
 967     // Merge the old data and the new data and store the results
 968     // Need to convert vectors here to integer as 'OR'ing floats assert
 969     rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
 970                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
 971                           DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
 972     rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
 973                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
 974                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));
 975
 976     low = DAG.getStore(the_chain, dl, rlow, basePtr,
 977                           lowMemPtr,
 978                           SN->isVolatile(), SN->isNonTemporal(), 16);
 979     hi  = DAG.getStore(the_chain, dl, rhi,
 980                             DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
 981                                         DAG.getConstant( 16, PtrVT)),
 982                             highMemPtr,
 983                             SN->isVolatile(), SN->isNonTemporal(), 16);
 984     result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
 985                                                            hi.getValue(0));
 986   }
 987
 988   return result;
 989 }
 990
 991 //! Generate the address of a constant pool entry.
 992 static SDValue
 993 LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
 994   EVT PtrVT = Op.getValueType();
 995   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 996   const Constant *C = CP->getConstVal();
 997   SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
 998   SDValue Zero = DAG.getConstant(0, PtrVT);
 999   const TargetMachine &TM = DAG.getTarget();
1000   // FIXME there is no actual debug info here
1001   DebugLoc dl = Op.getDebugLoc();
1002
1003   if (TM.getRelocationModel() == Reloc::Static) {
1004     if (!ST->usingLargeMem()) {
1005       // Just return the SDValue with the constant pool address in it.
1006       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
1007     } else {
1008       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
1009       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
1010       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1011     }
1012   }
1013
1014   llvm_unreachable("LowerConstantPool: Relocation model other than static"
1015                    " not supported.");
1016   return SDValue();
1017 }
1018
1019 //! Alternate entry point for generating the address of a constant pool entry
1020 SDValue
1021 SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
1022   return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
1023 }
1024
1025 static SDValue
1026 LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1027   EVT PtrVT = Op.getValueType();
1028   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
1029   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
1030   SDValue Zero = DAG.getConstant(0, PtrVT);
1031   const TargetMachine &TM = DAG.getTarget();
1032   // FIXME there is no actual debug info here
1033   DebugLoc dl = Op.getDebugLoc();
1034
1035   if (TM.getRelocationModel() == Reloc::Static) {
1036     if (!ST->usingLargeMem()) {
1037       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
1038     } else {
1039       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
1040       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
1041       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1042     }
1043   }
1044
1045   llvm_unreachable("LowerJumpTable: Relocation model other than static"
1046                    " not supported.");
1047   return SDValue();
1048 }
1049
1050 static SDValue
1051 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1052   EVT PtrVT = Op.getValueType();
1053   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
1054   const GlobalValue *GV = GSDN->getGlobal();
1055   SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
1056                                           PtrVT, GSDN->getOffset());
1057   const TargetMachine &TM = DAG.getTarget();
1058   SDValue Zero = DAG.getConstant(0, PtrVT);
1059   // FIXME there is no actual debug info here
1060   DebugLoc dl = Op.getDebugLoc();
1061
1062   if (TM.getRelocationModel() == Reloc::Static) {
1063     if (!ST->usingLargeMem()) {
1064       return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
1065     } else {
1066       SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
1067       SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
1068       return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1069     }
1070   } else {
1071     report_fatal_error("LowerGlobalAddress: Relocation model other than static"
1072                       "not supported.");
1073     /*NOTREACHED*/
1074   }
1075
1076   return SDValue();
1077 }
1078
1079 //! Custom lower double precision floating point constants
1080 static SDValue
1081 LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
1082   EVT VT = Op.getValueType();
1083   // FIXME there is no actual debug info here
1084   DebugLoc dl = Op.getDebugLoc();
1085
1086   if (VT == MVT::f64) {
1087     ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
1088
1089     assert((FP != 0) &&
1090            "LowerConstantFP: Node is not ConstantFPSDNode");
1091
1092     uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
1093     SDValue T = DAG.getConstant(dbits, MVT::i64);
1094     SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
1095     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
1096                        DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
1097   }
1098
1099   return SDValue();
1100 }
1101
1102 SDValue
1103 SPUTargetLowering::LowerFormalArguments(SDValue Chain,
1104                                         CallingConv::ID CallConv, bool isVarArg,
1105                                         const SmallVectorImpl<ISD::InputArg>
1106                                           &Ins,
1107                                         DebugLoc dl, SelectionDAG &DAG,
1108                                         SmallVectorImpl<SDValue> &InVals)
1109                                           const {
1110
1111   MachineFunction &MF = DAG.getMachineFunction();
1112   MachineFrameInfo *MFI = MF.getFrameInfo();
1113   MachineRegisterInfo &RegInfo = MF.getRegInfo();
1114   SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
1115
1116   unsigned ArgOffset = SPUFrameLowering::minStackSize();
1117   unsigned ArgRegIdx = 0;
1118   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1119
1120   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1121
1122   SmallVector<CCValAssign, 16> ArgLocs;
1123   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1124                  getTargetMachine(), ArgLocs, *DAG.getContext());
1125   // FIXME: allow for other calling conventions
1126   CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
1127
1128   // Add DAG nodes to load the arguments or copy them out of registers.
1129   for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
1130     EVT ObjectVT = Ins[ArgNo].VT;
1131     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
1132     SDValue ArgVal;
1133     CCValAssign &VA = ArgLocs[ArgNo];
1134
1135     if (VA.isRegLoc()) {
1136       const TargetRegisterClass *ArgRegClass;
1137
1138       switch (ObjectVT.getSimpleVT().SimpleTy) {
1139       default:
1140         report_fatal_error("LowerFormalArguments Unhandled argument type: " +
1141                            Twine(ObjectVT.getEVTString()));
1142       case MVT::i8:
1143         ArgRegClass = &SPU::R8CRegClass;
1144         break;
1145       case MVT::i16:
1146         ArgRegClass = &SPU::R16CRegClass;
1147         break;
1148       case MVT::i32:
1149         ArgRegClass = &SPU::R32CRegClass;
1150         break;
1151       case MVT::i64:
1152         ArgRegClass = &SPU::R64CRegClass;
1153         break;
1154       case MVT::i128:
1155         ArgRegClass = &SPU::GPRCRegClass;
1156         break;
1157       case MVT::f32:
1158         ArgRegClass = &SPU::R32FPRegClass;
1159         break;
1160       case MVT::f64:
1161         ArgRegClass = &SPU::R64FPRegClass;
1162         break;
1163       case MVT::v2f64:
1164       case MVT::v4f32:
1165       case MVT::v2i64:
1166       case MVT::v4i32:
1167       case MVT::v8i16:
1168       case MVT::v16i8:
1169         ArgRegClass = &SPU::VECREGRegClass;
1170         break;
1171       }
1172
1173       unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
1174       RegInfo.addLiveIn(VA.getLocReg(), VReg);
1175       ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
1176       ++ArgRegIdx;
1177     } else {
1178       // We need to load the argument to a virtual register if we determined
1179       // above that we ran out of physical registers of the appropriate type
1180       // or we're forced to do vararg
1181       int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
1182       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1183       ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
1184                            false, false, 0);
1185       ArgOffset += StackSlotSize;
1186     }
1187
1188     InVals.push_back(ArgVal);
1189     // Update the chain
1190     Chain = ArgVal.getOperand(0);
1191   }
1192
1193   // vararg handling:
1194   if (isVarArg) {
1195     // FIXME: we should be able to query the argument registers from
1196     //        tablegen generated code.
1197     static const unsigned ArgRegs[] = {
1198       SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
1199       SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
1200       SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
1201       SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
1202       SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
1203       SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
1204       SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
1205       SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
1206       SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
1207       SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
1208       SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
1209     };
1210     // size of ArgRegs array
1211     unsigned NumArgRegs = 77;
1212
1213     // We will spill (79-3)+1 registers to the stack
1214     SmallVector<SDValue, 79-3+1> MemOps;
1215
1216     // Create the frame slot
1217     for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1218       FuncInfo->setVarArgsFrameIndex(
1219         MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
1220       SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
1221       unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
1222       SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
1223       SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
1224                                    false, false, 0);
1225       Chain = Store.getOperand(0);
1226       MemOps.push_back(Store);
1227
1228       // Increment address by stack slot size for the next stored argument
1229       ArgOffset += StackSlotSize;
1230     }
1231     if (!MemOps.empty())
1232       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1233                           &MemOps[0], MemOps.size());
1234   }
1235
1236   return Chain;
1237 }
1238
1239 /// isLSAAddress - Return the immediate to use if the specified
1240 /// value is representable as a LSA address.
1241 static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1242   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1243   if (!C) return 0;
1244
1245   int Addr = C->getZExtValue();
1246   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1247       (Addr << 14 >> 14) != Addr)
1248     return 0;  // Top 14 bits have to be sext of immediate.
1249
1250   return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1251 }
1252
1253 SDValue
1254 SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1255                              CallingConv::ID CallConv, bool isVarArg,
1256                              bool &isTailCall,
1257                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1258                              const SmallVectorImpl<SDValue> &OutVals,
1259                              const SmallVectorImpl<ISD::InputArg> &Ins,
1260                              DebugLoc dl, SelectionDAG &DAG,
1261                              SmallVectorImpl<SDValue> &InVals) const {
1262   // CellSPU target does not yet support tail call optimization.
1263   isTailCall = false;
1264
1265   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
1266   unsigned NumOps     = Outs.size();
1267   unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1268
1269   SmallVector<CCValAssign, 16> ArgLocs;
1270   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1271                  getTargetMachine(), ArgLocs, *DAG.getContext());
1272   // FIXME: allow for other calling conventions
1273   CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);
1274
1275   const unsigned NumArgRegs = ArgLocs.size();
1276
1277
1278   // Handy pointer type
1279   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1280
1281   // Set up a copy of the stack pointer for use loading and storing any
1282   // arguments that may not fit in the registers available for argument
1283   // passing.
1284   SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
1285
1286   // Figure out which arguments are going to go in registers, and which in
1287   // memory.
1288   unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
1289   unsigned ArgRegIdx = 0;
1290
1291   // Keep track of registers passing arguments
1292   std::vector<std::pair<unsigned, SDValue> > RegsToPass;
1293   // And the arguments passed on the stack
1294   SmallVector<SDValue, 8> MemOpChains;
1295
1296   for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
1297     SDValue Arg = OutVals[ArgRegIdx];
1298     CCValAssign &VA = ArgLocs[ArgRegIdx];
1299
1300     // PtrOff will be used to store the current argument to the stack if a
1301     // register cannot be found for it.
1302     SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
1303     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
1304
1305     switch (Arg.getValueType().getSimpleVT().SimpleTy) {
1306     default: llvm_unreachable("Unexpected ValueType for argument!");
1307     case MVT::i8:
1308     case MVT::i16:
1309     case MVT::i32:
1310     case MVT::i64:
1311     case MVT::i128:
1312     case MVT::f32:
1313     case MVT::f64:
1314     case MVT::v2i64:
1315     case MVT::v2f64:
1316     case MVT::v4f32:
1317     case MVT::v4i32:
1318     case MVT::v8i16:
1319     case MVT::v16i8:
1320       if (ArgRegIdx != NumArgRegs) {
1321         RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1322       } else {
1323         MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
1324                                            MachinePointerInfo(),
1325                                            false, false, 0));
1326         ArgOffset += StackSlotSize;
1327       }
1328       break;
1329     }
1330   }
1331
1332   // Accumulate how many bytes are to be pushed on the stack, including the
1333   // linkage area, and parameter passing area.  According to the SPU ABI,
1334   // we minimally need space for [LR] and [SP].
1335   unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();
1336
1337   // Insert a call sequence start
1338   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
1339                                                             true));
1340
1341   if (!MemOpChains.empty()) {
1342     // Adjust the stack pointer for the stack arguments.
1343     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1344                         &MemOpChains[0], MemOpChains.size());
1345   }
1346
1347   // Build a sequence of copy-to-reg nodes chained together with token chain
1348   // and flag operands which copy the outgoing args into the appropriate regs.
1349   SDValue InFlag;
1350   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1351     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1352                              RegsToPass[i].second, InFlag);
1353     InFlag = Chain.getValue(1);
1354   }
1355
1356   SmallVector<SDValue, 8> Ops;
1357   unsigned CallOpc = SPUISD::CALL;
1358
1359   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
1360   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
1361   // node so that legalize doesn't hack it.
1362   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1363     const GlobalValue *GV = G->getGlobal();
1364     EVT CalleeVT = Callee.getValueType();
1365     SDValue Zero = DAG.getConstant(0, PtrVT);
1366     SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);
1367
1368     if (!ST->usingLargeMem()) {
1369       // Turn calls to targets that are defined (i.e., have bodies) into BRSL
1370       // style calls, otherwise, external symbols are BRASL calls. This assumes
1371       // that declared/defined symbols are in the same compilation unit and can
1372       // be reached through PC-relative jumps.
1373       //
1374       // NOTE:
1375       // This may be an unsafe assumption for JIT and really large compilation
1376       // units.
1377       if (GV->isDeclaration()) {
1378         Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
1379       } else {
1380         Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
1381       }
1382     } else {
1383       // "Large memory" mode: Turn all calls into indirect calls with a X-form
1384       // address pairs:
1385       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
1386     }
1387   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1388     EVT CalleeVT = Callee.getValueType();
1389     SDValue Zero = DAG.getConstant(0, PtrVT);
1390     SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
1391         Callee.getValueType());
1392
1393     if (!ST->usingLargeMem()) {
1394       Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
1395     } else {
1396       Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
1397     }
1398   } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
1399     // If this is an absolute destination address that appears to be a legal
1400     // local store address, use the munged value.
1401     Callee = SDValue(Dest, 0);
1402   }
1403
1404   Ops.push_back(Chain);
1405   Ops.push_back(Callee);
1406
1407   // Add argument registers to the end of the list so that they are known live
1408   // into the call.
1409   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1410     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1411                                   RegsToPass[i].second.getValueType()));
1412
1413   if (InFlag.getNode())
1414     Ops.push_back(InFlag);
1415   // Returns a chain and a flag for retval copy to use.
1416   Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
1417                       &Ops[0], Ops.size());
1418   InFlag = Chain.getValue(1);
1419
1420   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
1421                              DAG.getIntPtrConstant(0, true), InFlag);
1422   if (!Ins.empty())
1423     InFlag = Chain.getValue(1);
1424
1425   // If the function returns void, just return the chain.
1426   if (Ins.empty())
1427     return Chain;
1428
1429   // Now handle the return value(s)
1430   SmallVector<CCValAssign, 16> RVLocs;
1431   CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1432                     getTargetMachine(), RVLocs, *DAG.getContext());
1433   CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);
1434
1435
1436   // If the call has results, copy the values out of the ret val registers.
1437   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1438     CCValAssign VA = RVLocs[i];
1439
1440     SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1441                                      InFlag);
1442     Chain = Val.getValue(1);
1443     InFlag = Val.getValue(2);
1444     InVals.push_back(Val);
1445    }
1446
1447   return Chain;
1448 }
1449
1450 SDValue
1451 SPUTargetLowering::LowerReturn(SDValue Chain,
1452                                CallingConv::ID CallConv, bool isVarArg,
1453                                const SmallVectorImpl<ISD::OutputArg> &Outs,
1454                                const SmallVectorImpl<SDValue> &OutVals,
1455                                DebugLoc dl, SelectionDAG &DAG) const {
1456
1457   SmallVector<CCValAssign, 16> RVLocs;
1458   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1459                  getTargetMachine(), RVLocs, *DAG.getContext());
1460   CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
1461
1462   // If this is the first return lowered for this function, add the regs to the
1463   // liveout set for the function.
1464   if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1465     for (unsigned i = 0; i != RVLocs.size(); ++i)
1466       DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1467   }
1468
1469   SDValue Flag;
1470
1471   // Copy the result values into the output registers.
1472   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1473     CCValAssign &VA = RVLocs[i];
1474     assert(VA.isRegLoc() && "Can only return in registers!");
1475     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1476                              OutVals[i], Flag);
1477     Flag = Chain.getValue(1);
1478   }
1479
1480   if (Flag.getNode())
1481     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1482   else
1483     return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
1484 }
1485
1486
1487 //===----------------------------------------------------------------------===//
1488 // Vector related lowering:
1489 //===----------------------------------------------------------------------===//
1490
1491 static ConstantSDNode *
1492 getVecImm(SDNode *N) {
1493   SDValue OpVal(0, 0);
1494
1495   // Check to see if this buildvec has a single non-undef value in its elements.
1496   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1497     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1498     if (OpVal.getNode() == 0)
1499       OpVal = N->getOperand(i);
1500     else if (OpVal != N->getOperand(i))
1501       return 0;
1502   }
1503
1504   if (OpVal.getNode() != 0) {
1505     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1506       return CN;
1507     }
1508   }
1509
1510   return 0;
1511 }
1512
1513 /// get_vec_i18imm - Test if this vector is a vector filled with the same value
1514 /// and the value fits into an unsigned 18-bit constant, and if so, return the
1515 /// constant
1516 SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1517                               EVT ValueType) {
1518   if (ConstantSDNode *CN = getVecImm(N)) {
1519     uint64_t Value = CN->getZExtValue();
1520     if (ValueType == MVT::i64) {
1521       uint64_t UValue = CN->getZExtValue();
1522       uint32_t upper = uint32_t(UValue >> 32);
1523       uint32_t lower = uint32_t(UValue);
1524       if (upper != lower)
1525         return SDValue();
1526       Value = Value >> 32;
1527     }
1528     if (Value <= 0x3ffff)
1529       return DAG.getTargetConstant(Value, ValueType);
1530   }
1531
1532   return SDValue();
1533 }
1534
1535 /// get_vec_i16imm - Test if this vector is a vector filled with the same value
1536 /// and the value fits into a signed 16-bit constant, and if so, return the
1537 /// constant
1538 SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1539                               EVT ValueType) {
1540   if (ConstantSDNode *CN = getVecImm(N)) {
1541     int64_t Value = CN->getSExtValue();
1542     if (ValueType == MVT::i64) {
1543       uint64_t UValue = CN->getZExtValue();
1544       uint32_t upper = uint32_t(UValue >> 32);
1545       uint32_t lower = uint32_t(UValue);
1546       if (upper != lower)
1547         return SDValue();
1548       Value = Value >> 32;
1549     }
1550     if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1551       return DAG.getTargetConstant(Value, ValueType);
1552     }
1553   }
1554
1555   return SDValue();
1556 }
1557
1558 /// get_vec_i10imm - Test if this vector is a vector filled with the same value
1559 /// and the value fits into a signed 10-bit constant, and if so, return the
1560 /// constant
1561 SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1562                               EVT ValueType) {
1563   if (ConstantSDNode *CN = getVecImm(N)) {
1564     int64_t Value = CN->getSExtValue();
1565     if (ValueType == MVT::i64) {
1566       uint64_t UValue = CN->getZExtValue();
1567       uint32_t upper = uint32_t(UValue >> 32);
1568       uint32_t lower = uint32_t(UValue);
1569       if (upper != lower)
1570         return SDValue();
1571       Value = Value >> 32;
1572     }
1573     if (isInt<10>(Value))
1574       return DAG.getTargetConstant(Value, ValueType);
1575   }
1576
1577   return SDValue();
1578 }
1579
1580 /// get_vec_i8imm - Test if this vector is a vector filled with the same value
1581 /// and the value fits into a signed 8-bit constant, and if so, return the
1582 /// constant.
1583 ///
1584 /// @note: The incoming vector is v16i8 because that's the only way we can load
1585 /// constant vectors. Thus, we test to see if the upper and lower bytes are the
1586 /// same value.
1587 SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
1588                              EVT ValueType) {
1589   if (ConstantSDNode *CN = getVecImm(N)) {
1590     int Value = (int) CN->getZExtValue();
1591     if (ValueType == MVT::i16
1592         && Value <= 0xffff                 /* truncated from uint64_t */
1593         && ((short) Value >> 8) == ((short) Value & 0xff))
1594       return DAG.getTargetConstant(Value & 0xff, ValueType);
1595     else if (ValueType == MVT::i8
1596              && (Value & 0xff) == Value)
1597       return DAG.getTargetConstant(Value, ValueType);
1598   }
1599
1600   return SDValue();
1601 }
1602
1603 /// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
1604 /// and the value fits into a signed 16-bit constant, and if so, return the
1605 /// constant
1606 SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
1607                                EVT ValueType) {
1608   if (ConstantSDNode *CN = getVecImm(N)) {
1609     uint64_t Value = CN->getZExtValue();
1610     if ((ValueType == MVT::i32
1611           && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
1612         || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
1613       return DAG.getTargetConstant(Value >> 16, ValueType);
1614   }
1615
1616   return SDValue();
1617 }
1618
1619 /// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1620 SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1621   if (ConstantSDNode *CN = getVecImm(N)) {
1622     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1623   }
1624
1625   return SDValue();
1626 }
1627
1628 /// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1629 SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1630   if (ConstantSDNode *CN = getVecImm(N)) {
1631     return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1632   }
1633
1634   return SDValue();
1635 }
1636
1637 //! Lower a BUILD_VECTOR instruction creatively:
1638 static SDValue
1639 LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
1640   EVT VT = Op.getValueType();
1641   EVT EltVT = VT.getVectorElementType();
1642   DebugLoc dl = Op.getDebugLoc();
1643   BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
1644   assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
1645   unsigned minSplatBits = EltVT.getSizeInBits();
1646
1647   if (minSplatBits < 16)
1648     minSplatBits = 16;
1649
1650   APInt APSplatBits, APSplatUndef;
1651   unsigned SplatBitSize;
1652   bool HasAnyUndefs;
1653
1654   if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
1655                             HasAnyUndefs, minSplatBits)
1656       || minSplatBits < SplatBitSize)
1657     return SDValue();   // Wasn't a constant vector or splat exceeded min
1658
1659   uint64_t SplatBits = APSplatBits.getZExtValue();
1660
1661   switch (VT.getSimpleVT().SimpleTy) {
1662   default:
1663     report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
1664                        Twine(VT.getEVTString()));
1665     /*NOTREACHED*/
1666   case MVT::v4f32: {
1667     uint32_t Value32 = uint32_t(SplatBits);
1668     assert(SplatBitSize == 32
1669            && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
1670     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1671     SDValue T = DAG.getConstant(Value32, MVT::i32);
1672     return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
1673                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
1674     break;
1675   }
1676   case MVT::v2f64: {
1677     uint64_t f64val = uint64_t(SplatBits);
1678     assert(SplatBitSize == 64
1679            && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
1680     // NOTE: pretend the constant is an integer. LLVM won't load FP constants
1681     SDValue T = DAG.getConstant(f64val, MVT::i64);
1682     return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
1683                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
1684     break;
1685   }
1686   case MVT::v16i8: {
1687    // 8-bit constants have to be expanded to 16-bits
1688    unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
1689    SmallVector<SDValue, 8> Ops;
1690
1691    Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
1692    return DAG.getNode(ISD::BITCAST, dl, VT,
1693                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
1694   }
1695   case MVT::v8i16: {
1696     unsigned short Value16 = SplatBits;
1697     SDValue T = DAG.getConstant(Value16, EltVT);
1698     SmallVector<SDValue, 8> Ops;
1699
1700     Ops.assign(8, T);
1701     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
1702   }
1703   case MVT::v4i32: {
1704     SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
1705     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
1706   }
1707   case MVT::v2i64: {
1708     return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
1709   }
1710   }
1711
1712   return SDValue();
1713 }
1714
1715 /*!
1716  */
1717 SDValue
1718 SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
1719                      DebugLoc dl) {
1720   uint32_t upper = uint32_t(SplatVal >> 32);
1721   uint32_t lower = uint32_t(SplatVal);
1722
1723   if (upper == lower) {
1724     // Magic constant that can be matched by IL, ILA, et. al.
1725     SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
1726     return DAG.getNode(ISD::BITCAST, dl, OpVT,
1727                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1728                                    Val, Val, Val, Val));
1729   } else {
1730     bool upper_special, lower_special;
1731
1732     // NOTE: This code creates common-case shuffle masks that can be easily
1733     // detected as common expressions. It is not attempting to create highly
1734     // specialized masks to replace any and all 0's, 0xff's and 0x80's.
1735
1736     // Detect if the upper or lower half is a special shuffle mask pattern:
1737     upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
1738     lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
1739
1740     // Both upper and lower are special, lower to a constant pool load:
1741     if (lower_special && upper_special) {
1742       SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
1743       return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
1744                          SplatValCN, SplatValCN);
1745     }
1746
1747     SDValue LO32;
1748     SDValue HI32;
1749     SmallVector<SDValue, 16> ShufBytes;
1750     SDValue Result;
1751
1752     // Create lower vector if not a special pattern
1753     if (!lower_special) {
1754       SDValue LO32C = DAG.getConstant(lower, MVT::i32);
1755       LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1756                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1757                                      LO32C, LO32C, LO32C, LO32C));
1758     }
1759
1760     // Create upper vector if not a special pattern
1761     if (!upper_special) {
1762       SDValue HI32C = DAG.getConstant(upper, MVT::i32);
1763       HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
1764                          DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1765                                      HI32C, HI32C, HI32C, HI32C));
1766     }
1767
1768     // If either upper or lower are special, then the two input operands are
1769     // the same (basically, one of them is a "don't care")
1770     if (lower_special)
1771       LO32 = HI32;
1772     if (upper_special)
1773       HI32 = LO32;
1774
1775     for (int i = 0; i < 4; ++i) {
1776       uint64_t val = 0;
1777       for (int j = 0; j < 4; ++j) {
1778         SDValue V;
1779         bool process_upper, process_lower;
1780         val <<= 8;
1781         process_upper = (upper_special && (i & 1) == 0);
1782         process_lower = (lower_special && (i & 1) == 1);
1783
1784         if (process_upper || process_lower) {
1785           if ((process_upper && upper == 0)
1786                   || (process_lower && lower == 0))
1787             val |= 0x80;
1788           else if ((process_upper && upper == 0xffffffff)
1789                   || (process_lower && lower == 0xffffffff))
1790             val |= 0xc0;
1791           else if ((process_upper && upper == 0x80000000)
1792                   || (process_lower && lower == 0x80000000))
1793             val |= (j == 0 ? 0xe0 : 0x80);
1794         } else
1795           val |= i * 4 + j + ((i & 1) * 16);
1796       }
1797
1798       ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
1799     }
1800
1801     return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
1802                        DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
1803                                    &ShufBytes[0], ShufBytes.size()));
1804   }
1805 }
1806
1807 /// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
1808 /// which the Cell can operate. The code inspects V3 to ascertain whether the
1809 /// permutation vector, V3, is monotonically increasing with one "exception"
1810 /// element, e.g., (0, 1, _, 3). If this is the case, then generate a
1811 /// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
1812 /// In either case, the net result is going to eventually invoke SHUFB to
1813 /// permute/shuffle the bytes from V1 and V2.
1814 /// \note
1815 /// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
1816 /// control word for byte/halfword/word insertion. This takes care of a single
1817 /// element move from V2 into V1.
1818 /// \note
1819 /// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
1820 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
1821   const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
1822   SDValue V1 = Op.getOperand(0);
1823   SDValue V2 = Op.getOperand(1);
1824   DebugLoc dl = Op.getDebugLoc();
1825
1826   if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
1827
1828   // If we have a single element being moved from V1 to V2, this can be handled
1829   // using the C*[DX] compute mask instructions, but the vector elements have
1830   // to be monotonically increasing with one exception element, and the source
1831   // slot of the element to move must be the same as the destination.
1832   EVT VecVT = V1.getValueType();
1833   EVT EltVT = VecVT.getVectorElementType();
1834   unsigned EltsFromV2 = 0;
1835   unsigned V2EltOffset = 0;
1836   unsigned V2EltIdx0 = 0;
1837   unsigned CurrElt = 0;
1838   unsigned MaxElts = VecVT.getVectorNumElements();
1839   unsigned PrevElt = 0;
1840   bool monotonic = true;
1841   bool rotate = true;
1842   int rotamt=0;
1843   EVT maskVT;             // which of the c?d instructions to use
1844
1845   if (EltVT == MVT::i8) {
1846     V2EltIdx0 = 16;
1847     maskVT = MVT::v16i8;
1848   } else if (EltVT == MVT::i16) {
1849     V2EltIdx0 = 8;
1850     maskVT = MVT::v8i16;
1851   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
1852     V2EltIdx0 = 4;
1853     maskVT = MVT::v4i32;
1854   } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
1855     V2EltIdx0 = 2;
1856     maskVT = MVT::v2i64;
1857   } else
1858     llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
1859
1860   for (unsigned i = 0; i != MaxElts; ++i) {
1861     if (SVN->getMaskElt(i) < 0)
1862       continue;
1863
1864     unsigned SrcElt = SVN->getMaskElt(i);
1865
1866     if (monotonic) {
1867       if (SrcElt >= V2EltIdx0) {
1868         // TODO: optimize for the monotonic case when several consecutive
1869         // elements are taken form V2. Do we ever get such a case?
1870         if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
1871           V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
1872         else
1873           monotonic = false;
1874         ++EltsFromV2;
1875       } else if (CurrElt != SrcElt) {
1876         monotonic = false;
1877       }
1878
1879       ++CurrElt;
1880     }
1881
1882     if (rotate) {
1883       if (PrevElt > 0 && SrcElt < MaxElts) {
1884         if ((PrevElt == SrcElt - 1)
1885             || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
1886           PrevElt = SrcElt;
1887         } else {
1888           rotate = false;
1889         }
1890       } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
1891         // First time or after a "wrap around"
1892         rotamt = SrcElt-i;
1893         PrevElt = SrcElt;
1894       } else {
1895         // This isn't a rotation, takes elements from vector 2
1896         rotate = false;
1897       }
1898     }
1899   }
1900
1901   if (EltsFromV2 == 1 && monotonic) {
1902     // Compute mask and shuffle
1903     EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1904
1905     // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
1906     // R1 ($sp) is used here only as it is guaranteed to have last bits zero
1907     SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
1908                                 DAG.getRegister(SPU::R1, PtrVT),
1909                                 DAG.getConstant(V2EltOffset, MVT::i32));
1910     SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
1911                                      maskVT, Pointer);
1912
1913     // Use shuffle mask in SHUFB synthetic instruction:
1914     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
1915                        ShufMaskOp);
1916   } else if (rotate) {
1917     if (rotamt < 0)
1918       rotamt +=MaxElts;
1919     rotamt *= EltVT.getSizeInBits()/8;
1920     return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
1921                        V1, DAG.getConstant(rotamt, MVT::i16));
1922   } else {
1923    // Convert the SHUFFLE_VECTOR mask's input element units to the
1924    // actual bytes.
1925     unsigned BytesPerElement = EltVT.getSizeInBits()/8;
1926
1927     SmallVector<SDValue, 16> ResultMask;
1928     for (unsigned i = 0, e = MaxElts; i != e; ++i) {
1929       unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
1930
1931       for (unsigned j = 0; j < BytesPerElement; ++j)
1932         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
1933     }
1934     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
1935                                     &ResultMask[0], ResultMask.size());
1936     return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
1937   }
1938 }
1939
1940 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1941   SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1942   DebugLoc dl = Op.getDebugLoc();
1943
1944   if (Op0.getNode()->getOpcode() == ISD::Constant) {
1945     // For a constant, build the appropriate constant vector, which will
1946     // eventually simplify to a vector register load.
1947
1948     ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1949     SmallVector<SDValue, 16> ConstVecValues;
1950     EVT VT;
1951     size_t n_copies;
1952
1953     // Create a constant vector:
1954     switch (Op.getValueType().getSimpleVT().SimpleTy) {
1955     default: llvm_unreachable("Unexpected constant value type in "
1956                               "LowerSCALAR_TO_VECTOR");
1957     case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1958     case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1959     case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1960     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1961     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1962     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1963     }
1964
1965     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1966     for (size_t j = 0; j < n_copies; ++j)
1967       ConstVecValues.push_back(CValue);
1968
1969     return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
1970                        &ConstVecValues[0], ConstVecValues.size());
1971   } else {
1972     // Otherwise, copy the value from one register to another:
1973     switch (Op0.getValueType().getSimpleVT().SimpleTy) {
1974     default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
1975     case MVT::i8:
1976     case MVT::i16:
1977     case MVT::i32:
1978     case MVT::i64:
1979     case MVT::f32:
1980     case MVT::f64:
1981       return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
1982     }
1983   }
1984
1985   return SDValue();
1986 }
1987
1988 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
1989   EVT VT = Op.getValueType();
1990   SDValue N = Op.getOperand(0);
1991   SDValue Elt = Op.getOperand(1);
1992   DebugLoc dl = Op.getDebugLoc();
1993   SDValue retval;
1994
1995   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
1996     // Constant argument:
1997     int EltNo = (int) C->getZExtValue();
1998
1999     // sanity checks:
2000     if (VT == MVT::i8 && EltNo >= 16)
2001       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
2002     else if (VT == MVT::i16 && EltNo >= 8)
2003       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
2004     else if (VT == MVT::i32 && EltNo >= 4)
2005       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
2006     else if (VT == MVT::i64 && EltNo >= 2)
2007       llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");
2008
2009     if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
2010       // i32 and i64: Element 0 is the preferred slot
2011       return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
2012     }
2013
2014     // Need to generate shuffle mask and extract:
2015     int prefslot_begin = -1, prefslot_end = -1;
2016     int elt_byte = EltNo * VT.getSizeInBits() / 8;
2017
2018     switch (VT.getSimpleVT().SimpleTy) {
2019     default:
2020       assert(false && "Invalid value type!");
2021     case MVT::i8: {
2022       prefslot_begin = prefslot_end = 3;
2023       break;
2024     }
2025     case MVT::i16: {
2026       prefslot_begin = 2; prefslot_end = 3;
2027       break;
2028     }
2029     case MVT::i32:
2030     case MVT::f32: {
2031       prefslot_begin = 0; prefslot_end = 3;
2032       break;
2033     }
2034     case MVT::i64:
2035     case MVT::f64: {
2036       prefslot_begin = 0; prefslot_end = 7;
2037       break;
2038     }
2039     }
2040
2041     assert(prefslot_begin != -1 && prefslot_end != -1 &&
2042            "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");
2043
2044     unsigned int ShufBytes[16] = {
2045       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
2046     };
2047     for (int i = 0; i < 16; ++i) {
2048       // zero fill uppper part of preferred slot, don't care about the
2049       // other slots:
2050       unsigned int mask_val;
2051       if (i <= prefslot_end) {
2052         mask_val =
2053           ((i < prefslot_begin)
2054            ? 0x80
2055            : elt_byte + (i - prefslot_begin));
2056
2057         ShufBytes[i] = mask_val;
2058       } else
2059         ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
2060     }
2061
2062     SDValue ShufMask[4];
2063     for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
2064       unsigned bidx = i * 4;
2065       unsigned int bits = ((ShufBytes[bidx] << 24) |
2066                            (ShufBytes[bidx+1] << 16) |
2067                            (ShufBytes[bidx+2] << 8) |
2068                            ShufBytes[bidx+3]);
2069       ShufMask[i] = DAG.getConstant(bits, MVT::i32);
2070     }
2071
2072     SDValue ShufMaskVec =
2073       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2074                   &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));
2075
2076     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2077                          DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
2078                                      N, N, ShufMaskVec));
2079   } else {
2080     // Variable index: Rotate the requested element into slot 0, then replicate
2081     // slot 0 across the vector
2082     EVT VecVT = N.getValueType();
2083     if (!VecVT.isSimple() || !VecVT.isVector()) {
2084       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
2085                         "vector type!");
2086     }
2087
2088     // Make life easier by making sure the index is zero-extended to i32
2089     if (Elt.getValueType() != MVT::i32)
2090       Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);
2091
2092     // Scale the index to a bit/byte shift quantity
2093     APInt scaleFactor =
2094             APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
2095     unsigned scaleShift = scaleFactor.logBase2();
2096     SDValue vecShift;
2097
2098     if (scaleShift > 0) {
2099       // Scale the shift factor:
2100       Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
2101                         DAG.getConstant(scaleShift, MVT::i32));
2102     }
2103
2104     vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);
2105
2106     // Replicate the bytes starting at byte 0 across the entire vector (for
2107     // consistency with the notion of a unified register set)
2108     SDValue replicate;
2109
2110     switch (VT.getSimpleVT().SimpleTy) {
2111     default:
2112       report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
2113                         "type");
2114       /*NOTREACHED*/
2115     case MVT::i8: {
2116       SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
2117       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2118                               factor, factor, factor, factor);
2119       break;
2120     }
2121     case MVT::i16: {
2122       SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
2123       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2124                               factor, factor, factor, factor);
2125       break;
2126     }
2127     case MVT::i32:
2128     case MVT::f32: {
2129       SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
2130       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2131                               factor, factor, factor, factor);
2132       break;
2133     }
2134     case MVT::i64:
2135     case MVT::f64: {
2136       SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
2137       SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
2138       replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2139                               loFactor, hiFactor, loFactor, hiFactor);
2140       break;
2141     }
2142     }
2143
2144     retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
2145                          DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2146                                      vecShift, vecShift, replicate));
2147   }
2148
2149   return retval;
2150 }
2151
2152 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2153   SDValue VecOp = Op.getOperand(0);
2154   SDValue ValOp = Op.getOperand(1);
2155   SDValue IdxOp = Op.getOperand(2);
2156   DebugLoc dl = Op.getDebugLoc();
2157   EVT VT = Op.getValueType();
2158   EVT eltVT = ValOp.getValueType();
2159
2160   // use 0 when the lane to insert to is 'undef'
2161   int64_t Offset=0;
2162   if (IdxOp.getOpcode() != ISD::UNDEF) {
2163     ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
2164     assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
2165     Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
2166   }
2167
2168   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2169   // Use $sp ($1) because it's always 16-byte aligned and it's available:
2170   SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
2171                                 DAG.getRegister(SPU::R1, PtrVT),
2172                                 DAG.getConstant(Offset, PtrVT));
2173   // widen the mask when dealing with half vectors
2174   EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
2175                                 128/ VT.getVectorElementType().getSizeInBits());
2176   SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
2177
2178   SDValue result =
2179     DAG.getNode(SPUISD::SHUFB, dl, VT,
2180                 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
2181                 VecOp,
2182                 DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
2183
2184   return result;
2185 }
2186
2187 static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2188                            const TargetLowering &TLI)
2189 {
2190   SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2191   DebugLoc dl = Op.getDebugLoc();
2192   EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
2193
2194   assert(Op.getValueType() == MVT::i8);
2195   switch (Opc) {
2196   default:
2197     llvm_unreachable("Unhandled i8 math operator");
2198     /*NOTREACHED*/
2199     break;
2200   case ISD::ADD: {
2201     // 8-bit addition: Promote the arguments up to 16-bits and truncate
2202     // the result:
2203     SDValue N1 = Op.getOperand(1);
2204     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2205     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2206     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2207                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2208
2209   }
2210
2211   case ISD::SUB: {
2212     // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2213     // the result:
2214     SDValue N1 = Op.getOperand(1);
2215     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2216     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2217     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2218                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2219   }
2220   case ISD::ROTR:
2221   case ISD::ROTL: {
2222     SDValue N1 = Op.getOperand(1);
2223     EVT N1VT = N1.getValueType();
2224
2225     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2226     if (!N1VT.bitsEq(ShiftVT)) {
2227       unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
2228                        ? ISD::ZERO_EXTEND
2229                        : ISD::TRUNCATE;
2230       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2231     }
2232
2233     // Replicate lower 8-bits into upper 8:
2234     SDValue ExpandArg =
2235       DAG.getNode(ISD::OR, dl, MVT::i16, N0,
2236                   DAG.getNode(ISD::SHL, dl, MVT::i16,
2237                               N0, DAG.getConstant(8, MVT::i32)));
2238
2239     // Truncate back down to i8
2240     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2241                        DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
2242   }
2243   case ISD::SRL:
2244   case ISD::SHL: {
2245     SDValue N1 = Op.getOperand(1);
2246     EVT N1VT = N1.getValueType();
2247
2248     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2249     if (!N1VT.bitsEq(ShiftVT)) {
2250       unsigned N1Opc = ISD::ZERO_EXTEND;
2251
2252       if (N1.getValueType().bitsGT(ShiftVT))
2253         N1Opc = ISD::TRUNCATE;
2254
2255       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2256     }
2257
2258     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2259                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2260   }
2261   case ISD::SRA: {
2262     SDValue N1 = Op.getOperand(1);
2263     EVT N1VT = N1.getValueType();
2264
2265     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2266     if (!N1VT.bitsEq(ShiftVT)) {
2267       unsigned N1Opc = ISD::SIGN_EXTEND;
2268
2269       if (N1VT.bitsGT(ShiftVT))
2270         N1Opc = ISD::TRUNCATE;
2271       N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2272     }
2273
2274     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2275                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2276   }
2277   case ISD::MUL: {
2278     SDValue N1 = Op.getOperand(1);
2279
2280     N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2281     N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2282     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2283                        DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2284     break;
2285   }
2286   }
2287
2288   return SDValue();
2289 }
2290
2291 //! Lower byte immediate operations for v16i8 vectors:
2292 static SDValue
2293 LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
2294   SDValue ConstVec;
2295   SDValue Arg;
2296   EVT VT = Op.getValueType();
2297   DebugLoc dl = Op.getDebugLoc();
2298
2299   ConstVec = Op.getOperand(0);
2300   Arg = Op.getOperand(1);
2301   if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
2302     if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2303       ConstVec = ConstVec.getOperand(0);
2304     } else {
2305       ConstVec = Op.getOperand(1);
2306       Arg = Op.getOperand(0);
2307       if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
2308         ConstVec = ConstVec.getOperand(0);
2309       }
2310     }
2311   }
2312
2313   if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
2314     BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
2315     assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
2316
2317     APInt APSplatBits, APSplatUndef;
2318     unsigned SplatBitSize;
2319     bool HasAnyUndefs;
2320     unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
2321
2322     if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
2323                               HasAnyUndefs, minSplatBits)
2324         && minSplatBits <= SplatBitSize) {
2325       uint64_t SplatBits = APSplatBits.getZExtValue();
2326       SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
2327
2328       SmallVector<SDValue, 16> tcVec;
2329       tcVec.assign(16, tc);
2330       return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
2331                          DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
2332     }
2333   }
2334
2335   // These operations (AND, OR, XOR) are legal, they just couldn't be custom
2336   // lowered.  Return the operation, rather than a null SDValue.
2337   return Op;
2338 }
2339
2340 //! Custom lowering for CTPOP (count population)
2341 /*!
2342   Custom lowering code that counts the number ones in the input
2343   operand. SPU has such an instruction, but it counts the number of
2344   ones per byte, which then have to be accumulated.
2345 */
2346 static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2347   EVT VT = Op.getValueType();
2348   EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
2349                                VT, (128 / VT.getSizeInBits()));
2350   DebugLoc dl = Op.getDebugLoc();
2351
2352   switch (VT.getSimpleVT().SimpleTy) {
2353   default:
2354     assert(false && "Invalid value type!");
2355   case MVT::i8: {
2356     SDValue N = Op.getOperand(0);
2357     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2358
2359     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2360     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2361
2362     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
2363   }
2364
2365   case MVT::i16: {
2366     MachineFunction &MF = DAG.getMachineFunction();
2367     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2368
2369     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2370
2371     SDValue N = Op.getOperand(0);
2372     SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2373     SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2374     SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2375
2376     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2377     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2378
2379     // CNTB_result becomes the chain to which all of the virtual registers
2380     // CNTB_reg, SUM1_reg become associated:
2381     SDValue CNTB_result =
2382       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
2383
2384     SDValue CNTB_rescopy =
2385       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2386
2387     SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
2388
2389     return DAG.getNode(ISD::AND, dl, MVT::i16,
2390                        DAG.getNode(ISD::ADD, dl, MVT::i16,
2391                                    DAG.getNode(ISD::SRL, dl, MVT::i16,
2392                                                Tmp1, Shift1),
2393                                    Tmp1),
2394                        Mask0);
2395   }
2396
2397   case MVT::i32: {
2398     MachineFunction &MF = DAG.getMachineFunction();
2399     MachineRegisterInfo &RegInfo = MF.getRegInfo();
2400
2401     unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2402     unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2403
2404     SDValue N = Op.getOperand(0);
2405     SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2406     SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2407     SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2408     SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2409
2410     SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2411     SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2412
2413     // CNTB_result becomes the chain to which all of the virtual registers
2414     // CNTB_reg, SUM1_reg become associated:
2415     SDValue CNTB_result =
2416       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
2417
2418     SDValue CNTB_rescopy =
2419       DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2420
2421     SDValue Comp1 =
2422       DAG.getNode(ISD::SRL, dl, MVT::i32,
2423                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
2424                   Shift1);
2425
2426     SDValue Sum1 =
2427       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
2428                   DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
2429
2430     SDValue Sum1_rescopy =
2431       DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
2432
2433     SDValue Comp2 =
2434       DAG.getNode(ISD::SRL, dl, MVT::i32,
2435                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
2436                   Shift2);
2437     SDValue Sum2 =
2438       DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
2439                   DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
2440
2441     return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
2442   }
2443
2444   case MVT::i64:
2445     break;
2446   }
2447
2448   return SDValue();
2449 }
2450
2451 //! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
2452 /*!
2453  f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
2454  All conversions to i64 are expanded to a libcall.
2455  */
2456 static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2457                               const SPUTargetLowering &TLI) {
2458   EVT OpVT = Op.getValueType();
2459   SDValue Op0 = Op.getOperand(0);
2460   EVT Op0VT = Op0.getValueType();
2461
2462   if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
2463       || OpVT == MVT::i64) {
2464     // Convert f32 / f64 to i32 / i64 via libcall.
2465     RTLIB::Libcall LC =
2466             (Op.getOpcode() == ISD::FP_TO_SINT)
2467              ? RTLIB::getFPTOSINT(Op0VT, OpVT)
2468              : RTLIB::getFPTOUINT(Op0VT, OpVT);
2469     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
2470     SDValue Dummy;
2471     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2472   }
2473
2474   return Op;
2475 }
2476
2477 //! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
2478 /*!
2479  i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
2480  All conversions from i64 are expanded to a libcall.
2481  */
2482 static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2483                               const SPUTargetLowering &TLI) {
2484   EVT OpVT = Op.getValueType();
2485   SDValue Op0 = Op.getOperand(0);
2486   EVT Op0VT = Op0.getValueType();
2487
2488   if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
2489       || Op0VT == MVT::i64) {
2490     // Convert i32, i64 to f64 via libcall:
2491     RTLIB::Libcall LC =
2492             (Op.getOpcode() == ISD::SINT_TO_FP)
2493              ? RTLIB::getSINTTOFP(Op0VT, OpVT)
2494              : RTLIB::getUINTTOFP(Op0VT, OpVT);
2495     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
2496     SDValue Dummy;
2497     return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2498   }
2499
2500   return Op;
2501 }
2502
2503 //! Lower ISD::SETCC
2504 /*!
2505  This handles MVT::f64 (double floating point) condition lowering
2506  */
2507 static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2508                           const TargetLowering &TLI) {
2509   CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
2510   DebugLoc dl = Op.getDebugLoc();
2511   assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2512
2513   SDValue lhs = Op.getOperand(0);
2514   SDValue rhs = Op.getOperand(1);
2515   EVT lhsVT = lhs.getValueType();
2516   assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2517
2518   EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
2519   APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2520   EVT IntVT(MVT::i64);
2521
2522   // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
2523   // selected to a NOP:
2524   SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
2525   SDValue lhsHi32 =
2526           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2527                       DAG.getNode(ISD::SRL, dl, IntVT,
2528                                   i64lhs, DAG.getConstant(32, MVT::i32)));
2529   SDValue lhsHi32abs =
2530           DAG.getNode(ISD::AND, dl, MVT::i32,
2531                       lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
2532   SDValue lhsLo32 =
2533           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
2534
2535   // SETO and SETUO only use the lhs operand:
2536   if (CC->get() == ISD::SETO) {
2537     // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
2538     // SETUO
2539     APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2540     return DAG.getNode(ISD::XOR, dl, ccResultVT,
2541                        DAG.getSetCC(dl, ccResultVT,
2542                                     lhs, DAG.getConstantFP(0.0, lhsVT),
2543                                     ISD::SETUO),
2544                        DAG.getConstant(ccResultAllOnes, ccResultVT));
2545   } else if (CC->get() == ISD::SETUO) {
2546     // Evaluates to true if Op0 is [SQ]NaN
2547     return DAG.getNode(ISD::AND, dl, ccResultVT,
2548                        DAG.getSetCC(dl, ccResultVT,
2549                                     lhsHi32abs,
2550                                     DAG.getConstant(0x7ff00000, MVT::i32),
2551                                     ISD::SETGE),
2552                        DAG.getSetCC(dl, ccResultVT,
2553                                     lhsLo32,
2554                                     DAG.getConstant(0, MVT::i32),
2555                                     ISD::SETGT));
2556   }
2557
2558   SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
2559   SDValue rhsHi32 =
2560           DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2561                       DAG.getNode(ISD::SRL, dl, IntVT,
2562                                   i64rhs, DAG.getConstant(32, MVT::i32)));
2563
2564   // If a value is negative, subtract from the sign magnitude constant:
2565   SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
2566
2567   // Convert the sign-magnitude representation into 2's complement:
2568   SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2569                                       lhsHi32, DAG.getConstant(31, MVT::i32));
2570   SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
2571   SDValue lhsSelect =
2572           DAG.getNode(ISD::SELECT, dl, IntVT,
2573                       lhsSelectMask, lhsSignMag2TC, i64lhs);
2574
2575   SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2576                                       rhsHi32, DAG.getConstant(31, MVT::i32));
2577   SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
2578   SDValue rhsSelect =
2579           DAG.getNode(ISD::SELECT, dl, IntVT,
2580                       rhsSelectMask, rhsSignMag2TC, i64rhs);
2581
2582   unsigned compareOp;
2583
2584   switch (CC->get()) {
2585   case ISD::SETOEQ:
2586   case ISD::SETUEQ:
2587     compareOp = ISD::SETEQ; break;
2588   case ISD::SETOGT:
2589   case ISD::SETUGT:
2590     compareOp = ISD::SETGT; break;
2591   case ISD::SETOGE:
2592   case ISD::SETUGE:
2593     compareOp = ISD::SETGE; break;
2594   case ISD::SETOLT:
2595   case ISD::SETULT:
2596     compareOp = ISD::SETLT; break;
2597   case ISD::SETOLE:
2598   case ISD::SETULE:
2599     compareOp = ISD::SETLE; break;
2600   case ISD::SETUNE:
2601   case ISD::SETONE:
2602     compareOp = ISD::SETNE; break;
2603   default:
2604     report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
2605   }
2606
2607   SDValue result =
2608           DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
2609                        (ISD::CondCode) compareOp);
2610
2611   if ((CC->get() & 0x8) == 0) {
2612     // Ordered comparison:
2613     SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
2614                                   lhs, DAG.getConstantFP(0.0, MVT::f64),
2615                                   ISD::SETO);
2616     SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
2617                                   rhs, DAG.getConstantFP(0.0, MVT::f64),
2618                                   ISD::SETO);
2619     SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
2620
2621     result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
2622   }
2623
2624   return result;
2625 }
2626
2627 //! Lower ISD::SELECT_CC
2628 /*!
2629   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2630   SELB instruction.
2631
2632   \note Need to revisit this in the future: if the code path through the true
2633   and false value computations is longer than the latency of a branch (6
2634   cycles), then it would be more advantageous to branch and insert a new basic
2635   block and branch on the condition. However, this code does not make that
2636   assumption, given the simplisitc uses so far.
2637  */
2638
2639 static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2640                               const TargetLowering &TLI) {
2641   EVT VT = Op.getValueType();
2642   SDValue lhs = Op.getOperand(0);
2643   SDValue rhs = Op.getOperand(1);
2644   SDValue trueval = Op.getOperand(2);
2645   SDValue falseval = Op.getOperand(3);
2646   SDValue condition = Op.getOperand(4);
2647   DebugLoc dl = Op.getDebugLoc();
2648
2649   // NOTE: SELB's arguments: $rA, $rB, $mask
2650   //
2651   // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2652   // where bits in $mask are 1. CCond will be inverted, having 1s where the
2653   // condition was true and 0s where the condition was false. Hence, the
2654   // arguments to SELB get reversed.
2655
2656   // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2657   // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2658   // with another "cannot select select_cc" assert:
2659
2660   SDValue compare = DAG.getNode(ISD::SETCC, dl,
2661                                 TLI.getSetCCResultType(Op.getValueType()),
2662                                 lhs, rhs, condition);
2663   return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
2664 }
2665
2666 //! Custom lower ISD::TRUNCATE
2667 static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2668 {
2669   // Type to truncate to
2670   EVT VT = Op.getValueType();
2671   MVT simpleVT = VT.getSimpleVT();
2672   EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
2673                                VT, (128 / VT.getSizeInBits()));
2674   DebugLoc dl = Op.getDebugLoc();
2675
2676   // Type to truncate from
2677   SDValue Op0 = Op.getOperand(0);
2678   EVT Op0VT = Op0.getValueType();
2679
2680   if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
2681     // Create shuffle mask, least significant doubleword of quadword
2682     unsigned maskHigh = 0x08090a0b;
2683     unsigned maskLow = 0x0c0d0e0f;
2684     // Use a shuffle to perform the truncation
2685     SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2686                                    DAG.getConstant(maskHigh, MVT::i32),
2687                                    DAG.getConstant(maskLow, MVT::i32),
2688                                    DAG.getConstant(maskHigh, MVT::i32),
2689                                    DAG.getConstant(maskLow, MVT::i32));
2690
2691     SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2692                                        Op0, Op0, shufMask);
2693
2694     return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
2695   }
2696
2697   return SDValue();             // Leave the truncate unmolested
2698 }
2699
2700 /*!
2701  * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
2702  * algorithm is to duplicate the sign bit using rotmai to generate at
2703  * least one byte full of sign bits. Then propagate the "sign-byte" into
2704  * the leftmost words and the i64/i32 into the rightmost words using shufb.
2705  *
2706  * @param Op The sext operand
2707  * @param DAG The current DAG
2708  * @return The SDValue with the entire instruction sequence
2709  */
2710 static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
2711 {
2712   DebugLoc dl = Op.getDebugLoc();
2713
2714   // Type to extend to
2715   MVT OpVT = Op.getValueType().getSimpleVT();
2716
2717   // Type to extend from
2718   SDValue Op0 = Op.getOperand(0);
2719   MVT Op0VT = Op0.getValueType().getSimpleVT();
2720
2721   // extend i8 & i16 via i32
2722   if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
2723     Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
2724     Op0VT = MVT::i32;
2725   }
2726
2727   // The type to extend to needs to be a i128 and
2728   // the type to extend from needs to be i64 or i32.
2729   assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
2730           "LowerSIGN_EXTEND: input and/or output operand have wrong size");
2731
2732   // Create shuffle mask
2733   unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
2734   unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
2735   unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
2736   SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2737                                  DAG.getConstant(mask1, MVT::i32),
2738                                  DAG.getConstant(mask1, MVT::i32),
2739                                  DAG.getConstant(mask2, MVT::i32),
2740                                  DAG.getConstant(mask3, MVT::i32));
2741
2742   // Word wise arithmetic right shift to generate at least one byte
2743   // that contains sign bits.
2744   MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
2745   SDValue sraVal = DAG.getNode(ISD::SRA,
2746                  dl,
2747                  mvt,
2748                  DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
2749                  DAG.getConstant(31, MVT::i32));
2750
2751   // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
2752   SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
2753                                         dl, Op0VT, Op0,
2754                                         DAG.getTargetConstant(
2755                                                   SPU::GPRCRegClass.getID(),
2756                                                   MVT::i32)), 0);
2757   // Shuffle bytes - Copy the sign bits into the upper 64 bits
2758   // and the input value into the lower 64 bits.
2759   SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
2760         extended, sraVal, shufMask);
2761   return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
2762 }
2763
2764 //! Custom (target-specific) lowering entry point
2765 /*!
2766   This is where LLVM's DAG selection process calls to do target-specific
2767   lowering of nodes.
2768  */
2769 SDValue
2770 SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
2771 {
2772   unsigned Opc = (unsigned) Op.getOpcode();
2773   EVT VT = Op.getValueType();
2774
2775   switch (Opc) {
2776   default: {
2777 #ifndef NDEBUG
2778     errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2779     errs() << "Op.getOpcode() = " << Opc << "\n";
2780     errs() << "*Op.getNode():\n";
2781     Op.getNode()->dump();
2782 #endif
2783     llvm_unreachable(0);
2784   }
2785   case ISD::LOAD:
2786   case ISD::EXTLOAD:
2787   case ISD::SEXTLOAD:
2788   case ISD::ZEXTLOAD:
2789     return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2790   case ISD::STORE:
2791     return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2792   case ISD::ConstantPool:
2793     return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2794   case ISD::GlobalAddress:
2795     return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2796   case ISD::JumpTable:
2797     return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2798   case ISD::ConstantFP:
2799     return LowerConstantFP(Op, DAG);
2800
2801   // i8, i64 math ops:
2802   case ISD::ADD:
2803   case ISD::SUB:
2804   case ISD::ROTR:
2805   case ISD::ROTL:
2806   case ISD::SRL:
2807   case ISD::SHL:
2808   case ISD::SRA: {
2809     if (VT == MVT::i8)
2810       return LowerI8Math(Op, DAG, Opc, *this);
2811     break;
2812   }
2813
2814   case ISD::FP_TO_SINT:
2815   case ISD::FP_TO_UINT:
2816     return LowerFP_TO_INT(Op, DAG, *this);
2817
2818   case ISD::SINT_TO_FP:
2819   case ISD::UINT_TO_FP:
2820     return LowerINT_TO_FP(Op, DAG, *this);
2821
2822   // Vector-related lowering.
2823   case ISD::BUILD_VECTOR:
2824     return LowerBUILD_VECTOR(Op, DAG);
2825   case ISD::SCALAR_TO_VECTOR:
2826     return LowerSCALAR_TO_VECTOR(Op, DAG);
2827   case ISD::VECTOR_SHUFFLE:
2828     return LowerVECTOR_SHUFFLE(Op, DAG);
2829   case ISD::EXTRACT_VECTOR_ELT:
2830     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2831   case ISD::INSERT_VECTOR_ELT:
2832     return LowerINSERT_VECTOR_ELT(Op, DAG);
2833
2834   // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2835   case ISD::AND:
2836   case ISD::OR:
2837   case ISD::XOR:
2838     return LowerByteImmed(Op, DAG);
2839
2840   // Vector and i8 multiply:
2841   case ISD::MUL:
2842     if (VT == MVT::i8)
2843       return LowerI8Math(Op, DAG, Opc, *this);
2844
2845   case ISD::CTPOP:
2846     return LowerCTPOP(Op, DAG);
2847
2848   case ISD::SELECT_CC:
2849     return LowerSELECT_CC(Op, DAG, *this);
2850
2851   case ISD::SETCC:
2852     return LowerSETCC(Op, DAG, *this);
2853
2854   case ISD::TRUNCATE:
2855     return LowerTRUNCATE(Op, DAG);
2856
2857   case ISD::SIGN_EXTEND:
2858     return LowerSIGN_EXTEND(Op, DAG);
2859   }
2860
2861   return SDValue();
2862 }
2863
2864 void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
2865                                            SmallVectorImpl<SDValue>&Results,
2866                                            SelectionDAG &DAG) const
2867 {
2868 #if 0
2869   unsigned Opc = (unsigned) N->getOpcode();
2870   EVT OpVT = N->getValueType(0);
2871
2872   switch (Opc) {
2873   default: {
2874     errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
2875     errs() << "Op.getOpcode() = " << Opc << "\n";
2876     errs() << "*Op.getNode():\n";
2877     N->dump();
2878     abort();
2879     /*NOTREACHED*/
2880   }
2881   }
2882 #endif
2883
2884   /* Otherwise, return unchanged */
2885 }
2886
2887 //===----------------------------------------------------------------------===//
2888 // Target Optimization Hooks
2889 //===----------------------------------------------------------------------===//
2890
2891 SDValue
2892 SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
2893 {
2894 #if 0
2895   TargetMachine &TM = getTargetMachine();
2896 #endif
2897   const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
2898   SelectionDAG &DAG = DCI.DAG;
2899   SDValue Op0 = N->getOperand(0);       // everything has at least one operand
2900   EVT NodeVT = N->getValueType(0);      // The node's value type
2901   EVT Op0VT = Op0.getValueType();       // The first operand's result
2902   SDValue Result;                       // Initially, empty result
2903   DebugLoc dl = N->getDebugLoc();
2904
2905   switch (N->getOpcode()) {
2906   default: break;
2907   case ISD::ADD: {
2908     SDValue Op1 = N->getOperand(1);
2909
2910     if (Op0.getOpcode() == SPUISD::IndirectAddr
2911         || Op1.getOpcode() == SPUISD::IndirectAddr) {
2912       // Normalize the operands to reduce repeated code
2913       SDValue IndirectArg = Op0, AddArg = Op1;
2914
2915       if (Op1.getOpcode() == SPUISD::IndirectAddr) {
2916         IndirectArg = Op1;
2917         AddArg = Op0;
2918       }
2919
2920       if (isa<ConstantSDNode>(AddArg)) {
2921         ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
2922         SDValue IndOp1 = IndirectArg.getOperand(1);
2923
2924         if (CN0->isNullValue()) {
2925           // (add (SPUindirect <arg>, <arg>), 0) ->
2926           // (SPUindirect <arg>, <arg>)
2927
2928 #if !defined(NDEBUG)
2929           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2930             errs() << "\n"
2931                  << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
2932                  << "With:    (SPUindirect <arg>, <arg>)\n";
2933           }
2934 #endif
2935
2936           return IndirectArg;
2937         } else if (isa<ConstantSDNode>(IndOp1)) {
2938           // (add (SPUindirect <arg>, <const>), <const>) ->
2939           // (SPUindirect <arg>, <const + const>)
2940           ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
2941           int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
2942           SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);
2943
2944 #if !defined(NDEBUG)
2945           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2946             errs() << "\n"
2947                  << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
2948                  << "), " << CN0->getSExtValue() << ")\n"
2949                  << "With:    (SPUindirect <arg>, "
2950                  << combinedConst << ")\n";
2951           }
2952 #endif
2953
2954           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
2955                              IndirectArg, combinedValue);
2956         }
2957       }
2958     }
2959     break;
2960   }
2961   case ISD::SIGN_EXTEND:
2962   case ISD::ZERO_EXTEND:
2963   case ISD::ANY_EXTEND: {
2964     if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
2965       // (any_extend (SPUextract_elt0 <arg>)) ->
2966       // (SPUextract_elt0 <arg>)
2967       // Types must match, however...
2968 #if !defined(NDEBUG)
2969       if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
2970         errs() << "\nReplace: ";
2971         N->dump(&DAG);
2972         errs() << "\nWith:    ";
2973         Op0.getNode()->dump(&DAG);
2974         errs() << "\n";
2975       }
2976 #endif
2977
2978       return Op0;
2979     }
2980     break;
2981   }
2982   case SPUISD::IndirectAddr: {
2983     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
2984       ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
2985       if (CN != 0 && CN->isNullValue()) {
2986         // (SPUindirect (SPUaform <addr>, 0), 0) ->
2987         // (SPUaform <addr>, 0)
2988
2989         DEBUG(errs() << "Replace: ");
2990         DEBUG(N->dump(&DAG));
2991         DEBUG(errs() << "\nWith:    ");
2992         DEBUG(Op0.getNode()->dump(&DAG));
2993         DEBUG(errs() << "\n");
2994
2995         return Op0;
2996       }
2997     } else if (Op0.getOpcode() == ISD::ADD) {
2998       SDValue Op1 = N->getOperand(1);
2999       if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
3000         // (SPUindirect (add <arg>, <arg>), 0) ->
3001         // (SPUindirect <arg>, <arg>)
3002         if (CN1->isNullValue()) {
3003
3004 #if !defined(NDEBUG)
3005           if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
3006             errs() << "\n"
3007                  << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
3008                  << "With:    (SPUindirect <arg>, <arg>)\n";
3009           }
3010 #endif
3011
3012           return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
3013                              Op0.getOperand(0), Op0.getOperand(1));
3014         }
3015       }
3016     }
3017     break;
3018   }
3019   case SPUISD::SHL_BITS:
3020   case SPUISD::SHL_BYTES:
3021   case SPUISD::ROTBYTES_LEFT: {
3022     SDValue Op1 = N->getOperand(1);
3023
3024     // Kill degenerate vector shifts:
3025     if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
3026       if (CN->isNullValue()) {
3027         Result = Op0;
3028       }
3029     }
3030     break;
3031   }
3032   case SPUISD::PREFSLOT2VEC: {
3033     switch (Op0.getOpcode()) {
3034     default:
3035       break;
3036     case ISD::ANY_EXTEND:
3037     case ISD::ZERO_EXTEND:
3038     case ISD::SIGN_EXTEND: {
3039       // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
3040       // <arg>
3041       // but only if the SPUprefslot2vec and <arg> types match.
3042       SDValue Op00 = Op0.getOperand(0);
3043       if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
3044         SDValue Op000 = Op00.getOperand(0);
3045         if (Op000.getValueType() == NodeVT) {
3046           Result = Op000;
3047         }
3048       }
3049       break;
3050     }
3051     case SPUISD::VEC2PREFSLOT: {
3052       // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
3053       // <arg>
3054       Result = Op0.getOperand(0);
3055       break;
3056     }
3057     }
3058     break;
3059   }
3060   }
3061
3062   // Otherwise, return unchanged.
3063 #ifndef NDEBUG
3064   if (Result.getNode()) {
3065     DEBUG(errs() << "\nReplace.SPU: ");
3066     DEBUG(N->dump(&DAG));
3067     DEBUG(errs() << "\nWith:        ");
3068     DEBUG(Result.getNode()->dump(&DAG));
3069     DEBUG(errs() << "\n");
3070   }
3071 #endif
3072
3073   return Result;
3074 }
3075
3076 //===----------------------------------------------------------------------===//
3077 // Inline Assembly Support
3078 //===----------------------------------------------------------------------===//
3079
3080 /// getConstraintType - Given a constraint letter, return the type of
3081 /// constraint it is for this target.
3082 SPUTargetLowering::ConstraintType
3083 SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
3084   if (ConstraintLetter.size() == 1) {
3085     switch (ConstraintLetter[0]) {
3086     default: break;
3087     case 'b':
3088     case 'r':
3089     case 'f':
3090     case 'v':
3091     case 'y':
3092       return C_RegisterClass;
3093     }
3094   }
3095   return TargetLowering::getConstraintType(ConstraintLetter);
3096 }
3097
3098 /// Examine constraint type and operand type and determine a weight value.
3099 /// This object must already have been set up with the operand type
3100 /// and the current alternative constraint selected.
3101 TargetLowering::ConstraintWeight
3102 SPUTargetLowering::getSingleConstraintMatchWeight(
3103     AsmOperandInfo &info, const char *constraint) const {
3104   ConstraintWeight weight = CW_Invalid;
3105   Value *CallOperandVal = info.CallOperandVal;
3106     // If we don't have a value, we can't do a match,
3107     // but allow it at the lowest weight.
3108   if (CallOperandVal == NULL)
3109     return CW_Default;
3110   // Look at the constraint type.
3111   switch (*constraint) {
3112   default:
3113     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3114     break;
3115     //FIXME: Seems like the supported constraint letters were just copied
3116     // from PPC, as the following doesn't correspond to the GCC docs.
3117     // I'm leaving it so until someone adds the corresponding lowering support.
3118   case 'b':
3119   case 'r':
3120   case 'f':
3121   case 'd':
3122   case 'v':
3123   case 'y':
3124     weight = CW_Register;
3125     break;
3126   }
3127   return weight;
3128 }
3129
3130 std::pair<unsigned, const TargetRegisterClass*>
3131 SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
3132                                                 EVT VT) const
3133 {
3134   if (Constraint.size() == 1) {
3135     // GCC RS6000 Constraint Letters
3136     switch (Constraint[0]) {
3137     case 'b':   // R1-R31
3138     case 'r':   // R0-R31
3139       if (VT == MVT::i64)
3140         return std::make_pair(0U, SPU::R64CRegisterClass);
3141       return std::make_pair(0U, SPU::R32CRegisterClass);
3142     case 'f':
3143       if (VT == MVT::f32)
3144         return std::make_pair(0U, SPU::R32FPRegisterClass);
3145       else if (VT == MVT::f64)
3146         return std::make_pair(0U, SPU::R64FPRegisterClass);
3147       break;
3148     case 'v':
3149       return std::make_pair(0U, SPU::GPRCRegisterClass);
3150     }
3151   }
3152
3153   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3154 }
3155
3156 //! Compute used/known bits for a SPU operand
3157 void
3158 SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
3159                                                   const APInt &Mask,
3160                                                   APInt &KnownZero,
3161                                                   APInt &KnownOne,
3162                                                   const SelectionDAG &DAG,
3163                                                   unsigned Depth ) const {
3164 #if 0
3165   const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
3166
3167   switch (Op.getOpcode()) {
3168   default:
3169     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
3170     break;
3171   case CALL:
3172   case SHUFB:
3173   case SHUFFLE_MASK:
3174   case CNTB:
3175   case SPUISD::PREFSLOT2VEC:
3176   case SPUISD::LDRESULT:
3177   case SPUISD::VEC2PREFSLOT:
3178   case SPUISD::SHLQUAD_L_BITS:
3179   case SPUISD::SHLQUAD_L_BYTES:
3180   case SPUISD::VEC_ROTL:
3181   case SPUISD::VEC_ROTR:
3182   case SPUISD::ROTBYTES_LEFT:
3183   case SPUISD::SELECT_MASK:
3184   case SPUISD::SELB:
3185   }
3186 #endif
3187 }
3188
3189 unsigned
3190 SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
3191                                                    unsigned Depth) const {
3192   switch (Op.getOpcode()) {
3193   default:
3194     return 1;
3195
3196   case ISD::SETCC: {
3197     EVT VT = Op.getValueType();
3198
3199     if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
3200       VT = MVT::i32;
3201     }
3202     return VT.getSizeInBits();
3203   }
3204   }
3205 }
3206
3207 // LowerAsmOperandForConstraint
3208 void
3209 SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
3210                                                 std::string &Constraint,
3211                                                 std::vector<SDValue> &Ops,
3212                                                 SelectionDAG &DAG) const {
3213   // Default, for the time being, to the base class handler
3214   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
3215 }
3216
3217 /// isLegalAddressImmediate - Return true if the integer value can be used
3218 /// as the offset of the target addressing mode.
3219 bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
3220                                                 const Type *Ty) const {
3221   // SPU's addresses are 256K:
3222   return (V > -(1 << 18) && V < (1 << 18) - 1);
3223 }
3224
3225 bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
3226   return false;
3227 }
3228
3229 bool
3230 SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3231   // The SPU target isn't yet aware of offsets.
3232   return false;
3233 }
3234
3235 // can we compare to Imm without writing it into a register?
3236 bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
3237   //ceqi, cgti, etc. all take s10 operand
3238   return isInt<10>(Imm);
3239 }
3240
3241 bool
3242 SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
3243                                          const Type * ) const{
3244
3245   // A-form: 18bit absolute address.
3246   if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
3247     return true;
3248
3249   // D-form: reg + 14bit offset
3250   if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
3251     return true;
3252
3253   // X-form: reg+reg
3254   if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
3255     return true;
3256
3257   return false;
3258 }