//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Custom DAG lowering for R600
//
//===----------------------------------------------------------------------===//

#include "R600ISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600FrameLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <utility>

using namespace llvm;

#include "R600GenCallingConv.inc"
R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
                                       const R600Subtarget &STI)
    : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) {
  addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass);
  addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass);
  addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass);
  addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass);
  addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass);
  addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass);

  computeRegisterProperties(Subtarget->getRegisterInfo());
  // Legalize loads and stores to the private address space.
  setOperationAction(ISD::LOAD, MVT::i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);

  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  // spaces, so it is custom lowered to handle those where it isn't.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  }

  // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);

  setOperationAction(ISD::STORE, MVT::i8, Custom);
  setOperationAction(ISD::STORE, MVT::i32, Custom);
  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  // We need to include these since trunc STORES to PRIVATE need
  // special handling to accommodate RMW
  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i16, Custom);
  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
  setTruncStoreAction(MVT::v8i32, MVT::v8i8, Custom);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Custom);
  setTruncStoreAction(MVT::v32i32, MVT::v32i8, Custom);

  // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
  // Set condition code actions
  setCondCodeAction(ISD::SETO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);

  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);

  setOperationAction(ISD::FCOS, MVT::f32, Custom);
  setOperationAction(ISD::FSIN, MVT::f32, Custom);

  setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);

  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::FSUB, MVT::f32, Expand);

  setOperationAction(ISD::FCEIL, MVT::f64, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
  setOperationAction(ISD::FRINT, MVT::f64, Custom);
  setOperationAction(ISD::FFLOOR, MVT::f64, Custom);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);

  setOperationAction(ISD::SETCC, MVT::i32, Expand);
  setOperationAction(ISD::SETCC, MVT::f32, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  setOperationAction(ISD::SELECT, MVT::i32, Expand);
  setOperationAction(ISD::SELECT, MVT::f32, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);

  // ADD, SUB overflow.
  // TODO: turn these into Legal?
  if (Subtarget->hasCARRY())
    setOperationAction(ISD::UADDO, MVT::i32, Custom);

  if (Subtarget->hasBORROW())
    setOperationAction(ISD::USUBO, MVT::i32, Custom);

  // Expand sign extension of vectors
  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);

  if (!Subtarget->hasBFE())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);

  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
  // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
  // to be Legal/Custom in order to avoid library calls.
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);

  if (!Subtarget->hasFMA()) {
    setOperationAction(ISD::FMA, MVT::f32, Expand);
    setOperationAction(ISD::FMA, MVT::f64, Expand);
  }

  // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
  // need it for R600.
  if (!Subtarget->hasFP32Denormals())
    setOperationAction(ISD::FMAD, MVT::f32, Legal);

  if (!Subtarget->hasBFI()) {
    // fcopysign can be done in a single instruction with BFI.
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  }

  if (!Subtarget->hasBCNT(32))
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);

  if (!Subtarget->hasBCNT(64))
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);

  if (Subtarget->hasFFBH())
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

  if (Subtarget->hasFFBL())
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

  // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
  // need it for R600.
  if (Subtarget->hasBFE())
    setHasExtractBitsInsn(true);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);

  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Expand);
    setOperationAction(ISD::SUBC, VT, Expand);
    setOperationAction(ISD::ADDE, VT, Expand);
    setOperationAction(ISD::SUBE, VT, Expand);
  }

  // LLVM will expand these to atomic_cmp_swap(0)
  // and atomic_swap, respectively.
  setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);

  // We need to custom lower some of the intrinsics
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  setSchedulingPreference(Sched::Source);

  setTargetDAGCombine(ISD::FP_ROUND);
  setTargetDAGCombine(ISD::FP_TO_SINT);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::SELECT_CC);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
  setTargetDAGCombine(ISD::LOAD);
}
static inline bool isEOP(MachineBasicBlock::iterator I) {
  if (std::next(I) == I->getParent()->end())
    return false;
  return std::next(I)->getOpcode() == R600::RETURN;
}
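
// Note: the custom inserter below expands R600 pseudo instructions that need
// MachineInstr-level rewriting: flag-carrying moves (abs/neg/mask), immediate
// moves, predicated branches, export merging and RAT writes.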
MachineBasicBlock *
R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock::iterator I = MI;
  const R600InstrInfo *TII = Subtarget->getInstrInfo();

  switch (MI.getOpcode()) {
  default:
    // Replace LDS_*_RET instructions that don't have any uses with the
    // equivalent LDS_*_NORET instruction.
    if (TII->isLDSRetInstr(MI.getOpcode())) {
      int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst);
      assert(DstIdx != -1);
      MachineInstrBuilder NewMI;
      // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
      //        LDS_1A2D support and remove this special case.
      if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
          MI.getOpcode() == R600::LDS_CMPST_RET)
        return BB;

      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
                      TII->get(R600::getLDSNoRetOp(MI.getOpcode())));
      for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
        NewMI.add(MI.getOperand(i));
      }
    } else {
      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
    }
    break;

  case R600::FABS_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, R600::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
    break;
  }

  case R600::FNEG_R600: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, I, R600::MOV, MI.getOperand(0).getReg(),
        MI.getOperand(1).getReg());
    TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
    break;
  }

  case R600::MASK_WRITE: {
    Register maskedRegister = MI.getOperand(0).getReg();
    assert(Register::isVirtualRegister(maskedRegister));
    MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
    TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
    break;
  }

  case R600::MOV_IMM_F32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getFPImm()->getValueAPF()
                         .bitcastToAPInt().getZExtValue());
    break;

  case R600::MOV_IMM_I32:
    TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
                     MI.getOperand(1).getImm());
    break;

  case R600::MOV_IMM_GLOBAL_ADDR: {
    //TODO: Perhaps combine this instruction with the next if possible
    auto MIB = TII->buildDefaultInstruction(
        *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X);
    int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal);
    //TODO: Ugh this is rather ugly
    MIB->getOperand(Idx) = MI.getOperand(1);
    break;
  }

  case R600::CONST_COPY: {
    MachineInstr *NewMI = TII->buildDefaultInstruction(
        *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST);
    TII->setImmOperand(*NewMI, R600::OpName::src0_sel,
                       MI.getOperand(1).getImm());
    break;
  }

  case R600::RAT_WRITE_CACHELESS_32_eg:
  case R600::RAT_WRITE_CACHELESS_64_eg:
  case R600::RAT_WRITE_CACHELESS_128_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case R600::RAT_STORE_TYPED_eg:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .addImm(isEOP(I)); // Set End of program bit
    break;

  case R600::BRANCH:
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP))
        .add(MI.getOperand(0));
    break;

  case R600::BRANCH_COND_f32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
                R600::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(R600::PRED_SETNE)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(R600::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case R600::BRANCH_COND_i32: {
    MachineInstr *NewMI =
        BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X),
                R600::PREDICATE_BIT)
            .add(MI.getOperand(1))
            .addImm(R600::PRED_SETNE_INT)
            .addImm(0); // Flags
    TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND))
        .add(MI.getOperand(0))
        .addReg(R600::PREDICATE_BIT, RegState::Kill);
    break;
  }

  case R600::EG_ExportSwz:
  case R600::R600_ExportSwz: {
    // Instruction is left unmodified if it's not the last one of its type
    bool isLastInstructionOfItsType = true;
    unsigned InstExportType = MI.getOperand(1).getImm();
    for (MachineBasicBlock::iterator NextExportInst = std::next(I),
         EndBlock = BB->end(); NextExportInst != EndBlock;
         NextExportInst = std::next(NextExportInst)) {
      if (NextExportInst->getOpcode() == R600::EG_ExportSwz ||
          NextExportInst->getOpcode() == R600::R600_ExportSwz) {
        unsigned CurrentInstExportType =
            NextExportInst->getOperand(1).getImm();
        if (CurrentInstExportType == InstExportType) {
          isLastInstructionOfItsType = false;
          break;
        }
      }
    }
    bool EOP = isEOP(I);
    if (!EOP && !isLastInstructionOfItsType)
      return BB;
    unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 84 : 40;
    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
        .add(MI.getOperand(0))
        .add(MI.getOperand(1))
        .add(MI.getOperand(2))
        .add(MI.getOperand(3))
        .add(MI.getOperand(4))
        .add(MI.getOperand(5))
        .add(MI.getOperand(6))
        .addImm(CfInst)
        .addImm(EOP);
    break;
  }
  case R600::RETURN: {
    return BB;
  }
  }

  MI.eraseFromParent();
  return BB;
}
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
  switch (Op.getOpcode()) {
  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
  case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
  case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
  case ISD::FCOS:
  case ISD::FSIN: return LowerTrig(Op, DAG);
  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
  case ISD::STORE: return LowerSTORE(Op, DAG);
  case ISD::LOAD: {
    SDValue Result = LowerLOAD(Op, DAG);
    assert((!Result.getNode() ||
            Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    return Result;
  }

  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
  case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
  case ISD::INTRINSIC_VOID: {
    SDValue Chain = Op.getOperand(0);
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    switch (IntrinsicID) {
    case Intrinsic::r600_store_swizzle: {
      SDLoc DL(Op);
      const SDValue Args[8] = {
        Chain,
        Op.getOperand(2), // Export Value
        Op.getOperand(3), // ArrayBase
        Op.getOperand(4), // Type
        DAG.getConstant(0, DL, MVT::i32), // SWZ_X
        DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
        DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
        DAG.getConstant(3, DL, MVT::i32) // SWZ_W
      };
      return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, Op.getValueType(), Args);
    }

    // default for switch(IntrinsicID)
    default: break;
    }
    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrinsicID =
        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    EVT VT = Op.getValueType();
    SDLoc DL(Op);
    switch (IntrinsicID) {
    case Intrinsic::r600_tex:
    case Intrinsic::r600_texc: {
      unsigned TextureOp;
      switch (IntrinsicID) {
      case Intrinsic::r600_tex:
        TextureOp = 0;
        break;
      case Intrinsic::r600_texc:
        TextureOp = 1;
        break;
      default:
        llvm_unreachable("unhandled texture operation");
      }

      SDValue TexArgs[19] = {
        DAG.getConstant(TextureOp, DL, MVT::i32),
        Op.getOperand(1),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(2),
        Op.getOperand(3),
        Op.getOperand(4),
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(1, DL, MVT::i32),
        DAG.getConstant(2, DL, MVT::i32),
        DAG.getConstant(3, DL, MVT::i32),
        Op.getOperand(5),
        Op.getOperand(6),
        Op.getOperand(7),
        Op.getOperand(8),
        Op.getOperand(9),
        Op.getOperand(10)
      };
      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
    }
    case Intrinsic::r600_dot4: {
      SDValue Args[8] = {
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(0, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(1, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(2, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
                    DAG.getConstant(3, DL, MVT::i32)),
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
                    DAG.getConstant(3, DL, MVT::i32))
      };
      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
    }

    case Intrinsic::r600_implicitarg_ptr: {
      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
      uint32_t ByteOffset = getImplicitParameterOffset(MF, FIRST_IMPLICIT);
      return DAG.getConstant(ByteOffset, DL, PtrVT);
    }
    case Intrinsic::r600_read_ngroups_x:
      return LowerImplicitParameter(DAG, VT, DL, 0);
    case Intrinsic::r600_read_ngroups_y:
      return LowerImplicitParameter(DAG, VT, DL, 1);
    case Intrinsic::r600_read_ngroups_z:
      return LowerImplicitParameter(DAG, VT, DL, 2);
    case Intrinsic::r600_read_global_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 3);
    case Intrinsic::r600_read_global_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 4);
    case Intrinsic::r600_read_global_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 5);
    case Intrinsic::r600_read_local_size_x:
      return LowerImplicitParameter(DAG, VT, DL, 6);
    case Intrinsic::r600_read_local_size_y:
      return LowerImplicitParameter(DAG, VT, DL, 7);
    case Intrinsic::r600_read_local_size_z:
      return LowerImplicitParameter(DAG, VT, DL, 8);

    case Intrinsic::r600_read_tgid_x:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_X, VT);
    case Intrinsic::r600_read_tgid_y:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_Y, VT);
    case Intrinsic::r600_read_tgid_z:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T1_Z, VT);
    case Intrinsic::r600_read_tidig_x:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_X, VT);
    case Intrinsic::r600_read_tidig_y:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_Y, VT);
    case Intrinsic::r600_read_tidig_z:
      return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
                                     R600::T0_Z, VT);

    case Intrinsic::r600_recipsqrt_ieee:
      return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));

    case Intrinsic::r600_recipsqrt_clamped:
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
    default:
      return SDValue();
    }

    // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
    break;
  }
  } // end switch(Op.getOpcode())
  return SDValue();
}
void R600TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
      return;
    }
    // Since we don't care about out of bounds values we can use FP_TO_SINT for
    // uints too. The DAGLegalizer code for uint considers some extra cases
    // which are not necessary here.
    LLVM_FALLTHROUGH;
  case ISD::FP_TO_SINT: {
    if (N->getValueType(0) == MVT::i1) {
      Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
      return;
    }

    SDValue Result;
    if (expandFP_TO_SINT(N, Result, DAG))
      Results.push_back(Result);
    return;
  }
  case ISD::SDIVREM: {
    SDValue Op = SDValue(N, 1);
    SDValue RES = LowerSDIVREM(Op, DAG);
    Results.push_back(RES);
    Results.push_back(RES.getValue(1));
    break;
  }
  case ISD::UDIVREM: {
    SDValue Op = SDValue(N, 0);
    LowerUDIVREM64(Op, DAG, Results);
    break;
  }
  }
}
SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
                                                   SDValue Vector) const {
  SDLoc DL(Vector);
  EVT VecVT = Vector.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  SmallVector<SDValue, 8> Args;

  for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
    Args.push_back(DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
        DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
  }

  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
}
SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                    SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Index = Op.getOperand(1);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
                     Vector, Index);
}
SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                   SelectionDAG &DAG) const {
  SDLoc DL(Op);
  SDValue Vector = Op.getOperand(0);
  SDValue Value = Op.getOperand(1);
  SDValue Index = Op.getOperand(2);

  if (isa<ConstantSDNode>(Index) ||
      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
    return Op;

  Vector = vectorToVerticalVector(DAG, Vector);
  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
                               Vector, Value, Index);
  return vectorToVerticalVector(DAG, Insert);
}
SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                               SDValue Op,
                                               SelectionDAG &DAG) const {
  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

  const DataLayout &DL = DAG.getDataLayout();
  const GlobalValue *GV = GSD->getGlobal();
  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);

  SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
  return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
}
SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
  // On hw >= R700, COS/SIN input must be between -1. and 1.
  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
  EVT VT = Op.getValueType();
  SDValue Arg = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Should this propagate fast-math-flags?
  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT,
          DAG.getNode(ISD::FMUL, DL, VT, Arg,
              DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
          DAG.getConstantFP(0.5, DL, MVT::f32)));
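
  // 0.15915494309 approximates 1/(2*pi), so the FMUL/FADD/FRACT sequence
  // above maps one full input period onto [0, 1); the -0.5 recentering below
  // brings it into the range expected by the hardware COS/SIN instructions.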
  unsigned TrigNode;
  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigNode = AMDGPUISD::COS_HW;
    break;
  case ISD::FSIN:
    TrigNode = AMDGPUISD::SIN_HW;
    break;
  default:
    llvm_unreachable("Wrong trig opcode");
  }
  SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
      DAG.getNode(ISD::FADD, DL, VT, FractPart,
          DAG.getConstantFP(-0.5, DL, MVT::f32)));
  if (Gen >= AMDGPUSubtarget::R700)
    return TrigVal;
  // On R600 hw, COS/SIN input must be between -Pi and Pi.
  return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
                     DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}
SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One  = DAG.getConstant(1, DL, VT);

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
  Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
  HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
  SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);

  SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
  SDValue LoBig = Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}
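
// Note for both *_PARTS lowerings: splitting the complementary shift into
// (x >> (31 - Shift)) >> 1 instead of x >> (32 - Shift) avoids an
// out-of-range shift by 32 when Shift == 0, at the cost of one extra shift.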
SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Shift = Op.getOperand(2);
  SDValue Zero = DAG.getConstant(0, DL, VT);
  SDValue One  = DAG.getConstant(1, DL, VT);

  const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;

  SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
  SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
  SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
  SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);

  // The dance around Width1 is necessary for 0 special case.
  // Without it the CompShift might be 32, producing incorrect results in
  // Overflow. So we do the shift in two steps, the alternative is to
  // add a conditional to filter the special case.

  SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
  Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);

  SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
  SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
  LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);

  SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
  SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;

  Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
  Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Lo, Hi);
}
SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
                                          unsigned mainop, unsigned ovf) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);

  SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
  OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
                    DAG.getValueType(MVT::i1));

  SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);

  return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
}
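
// For i1 results, FP_TO_UINT / FP_TO_SINT are lowered below as a single
// equality compare against 1.0f / -1.0f; out-of-range inputs do not need a
// defined result here (see the matching comment in ReplaceNodeResults).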
SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(ISD::SETCC, DL, MVT::i1,
                     Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
                     DAG.getCondCode(ISD::SETEQ));
}

SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(ISD::SETCC, DL, MVT::i1,
                     Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
                     DAG.getCondCode(ISD::SETEQ));
}
SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                   const SDLoc &DL,
                                                   unsigned DwordOffset) const {
  unsigned ByteOffset = DwordOffset * 4;
  PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                           AMDGPUAS::PARAM_I_ADDRESS);

  // We shouldn't be using an offset wider than 16-bits for implicit parameters.
  assert(isInt<16>(ByteOffset));

  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
                     DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
                     MachinePointerInfo(ConstantPointerNull::get(PtrType)));
}
bool R600TargetLowering::isZero(SDValue Op) const {
  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
    return Cst->isNullValue();
  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CstFP->isZero();
  } else {
    return false;
  }
}

bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->isExactlyValue(1.0);
  }
  return isAllOnesConstant(Op);
}

bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
  if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
    return CFP->getValueAPF().isZero();
  }
  return isNullConstant(Op);
}
SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();

  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue True = Op.getOperand(2);
  SDValue False = Op.getOperand(3);
  SDValue CC = Op.getOperand(4);
  SDValue Temp;

  if (VT == MVT::f32) {
    DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
    SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
    if (MinMax)
      return MinMax;
  }

  // LHS and RHS are guaranteed to be the same value type
  EVT CompareVT = LHS.getValueType();

  // Check if we can lower this to a native operation.

  // Try to lower to a SET* instruction:
  //
  // SET* can match the following patterns:
  //
  // select_cc f32, f32, -1,  0, cc_supported
  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
  // select_cc i32, i32, -1,  0, cc_supported
  //

  // Move hardware True/False values to the correct operand.
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
  if (isHWTrueValue(False) && isHWFalseValue(True)) {
    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
      std::swap(False, True);
      CC = DAG.getCondCode(InverseCC);
    } else {
      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
        std::swap(False, True);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(SwapInvCC);
      }
    }
  }

  if (isHWTrueValue(True) && isHWFalseValue(False) &&
      (CompareVT == VT || VT == MVT::i32)) {
    // This can be matched by a SET* instruction.
    return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
  }

  // Try to lower to a CND* instruction:
  //
  // CND* can match the following patterns:
  //
  // select_cc f32, 0.0, f32, f32, cc_supported
  // select_cc f32, 0.0, i32, i32, cc_supported
  // select_cc i32, 0,   f32, f32, cc_supported
  // select_cc i32, 0,   i32, i32, cc_supported
  //

  // Try to move the zero value to the RHS
  if (isZero(LHS)) {
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    // Try swapping the operands
    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
      std::swap(LHS, RHS);
      CC = DAG.getCondCode(CCSwapped);
    } else {
      // Try inverting the condition and then swapping the operands
      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
        std::swap(True, False);
        std::swap(LHS, RHS);
        CC = DAG.getCondCode(CCSwapped);
      }
    }
  }
  if (isZero(RHS)) {
    SDValue Cond = LHS;
    SDValue Zero = RHS;
    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
    if (CompareVT != VT) {
      // Bitcast True / False to the correct types. This will end up being
      // a nop, but it allows us to define only a single pattern in the
      // .TD files for each CND* instruction rather than having to have
      // one pattern for integer True/False and one for fp True/False
      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
    }

    switch (CCOpcode) {
    case ISD::SETONE:
    case ISD::SETUNE:
    case ISD::SETNE:
      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
      Temp = True;
      True = False;
      False = Temp;
      break;
    default:
      break;
    }
    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
                                     Cond, Zero,
                                     True, False,
                                     DAG.getCondCode(CCOpcode));
    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
  }

  // If we make it this far it means we have no native instructions to handle
  // this SELECT_CC, so we must lower it.
  SDValue HWTrue, HWFalse;

  if (CompareVT == MVT::f32) {
    HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
    HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
  } else if (CompareVT == MVT::i32) {
    HWTrue = DAG.getConstant(-1, DL, CompareVT);
    HWFalse = DAG.getConstant(0, DL, CompareVT);
  } else {
    llvm_unreachable("Unhandled value type in LowerSELECT_CC");
  }

  // Lower this unsupported SELECT_CC into a combination of two supported
  // SELECT_CC operations.
  SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS,
                             HWTrue, HWFalse, CC);

  return DAG.getNode(ISD::SELECT_CC, DL, VT,
                     Cond, HWFalse,
                     True, False,
                     DAG.getCondCode(ISD::SETNE));
}
/// LLVM generates byte-addressed pointers. For indirect addressing, we need to
/// convert these pointers to a register index. Each register holds
/// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
/// \p StackWidth, which tells us how many of the 4 sub-registers will be used
/// for indirect addressing.
SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
                                               unsigned StackWidth,
                                               SelectionDAG &DAG) const {
  unsigned SRLPad;
  switch (StackWidth) {
  case 1:
    SRLPad = 2;
    break;
  case 2:
    SRLPad = 3;
    break;
  case 4:
    SRLPad = 4;
    break;
  default: llvm_unreachable("Invalid stack width");
  }

  SDLoc DL(Ptr);
  return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(SRLPad, DL, MVT::i32));
}

void R600TargetLowering::getStackAddress(unsigned StackWidth,
                                         unsigned ElemIdx,
                                         unsigned &Channel,
                                         unsigned &PtrIncr) const {
  switch (StackWidth) {
  default:
  case 1:
    Channel = 0;
    if (ElemIdx > 0) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 2:
    Channel = ElemIdx % 2;
    if (ElemIdx == 2) {
      PtrIncr = 1;
    } else {
      PtrIncr = 0;
    }
    break;
  case 4:
    Channel = ElemIdx;
    PtrIncr = 0;
    break;
  }
}
R600TargetLowering::lowerPrivateTruncStore(StoreSDNode
*Store
,
1139 SelectionDAG
&DAG
) const {
1141 //TODO: Who creates the i8 stores?
1142 assert(Store
->isTruncatingStore()
1143 || Store
->getValue().getValueType() == MVT::i8
);
1144 assert(Store
->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
);
1147 if (Store
->getMemoryVT() == MVT::i8
) {
1148 assert(Store
->getAlignment() >= 1);
1149 Mask
= DAG
.getConstant(0xff, DL
, MVT::i32
);
1150 } else if (Store
->getMemoryVT() == MVT::i16
) {
1151 assert(Store
->getAlignment() >= 2);
1152 Mask
= DAG
.getConstant(0xffff, DL
, MVT::i32
);
1154 llvm_unreachable("Unsupported private trunc store");
1157 SDValue OldChain
= Store
->getChain();
1158 bool VectorTrunc
= (OldChain
.getOpcode() == AMDGPUISD::DUMMY_CHAIN
);
1160 SDValue Chain
= VectorTrunc
? OldChain
->getOperand(0) : OldChain
;
1161 SDValue BasePtr
= Store
->getBasePtr();
1162 SDValue Offset
= Store
->getOffset();
1163 EVT MemVT
= Store
->getMemoryVT();
1165 SDValue LoadPtr
= BasePtr
;
1166 if (!Offset
.isUndef()) {
1167 LoadPtr
= DAG
.getNode(ISD::ADD
, DL
, MVT::i32
, BasePtr
, Offset
);
1170 // Get dword location
1171 // TODO: this should be eliminated by the future SHR ptr, 2
1172 SDValue Ptr
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, LoadPtr
,
1173 DAG
.getConstant(0xfffffffc, DL
, MVT::i32
));
1176 // TODO: can we be smarter about machine pointer info?
1177 MachinePointerInfo
PtrInfo(UndefValue::get(
1178 Type::getInt32PtrTy(*DAG
.getContext(), AMDGPUAS::PRIVATE_ADDRESS
)));
1179 SDValue Dst
= DAG
.getLoad(MVT::i32
, DL
, Chain
, Ptr
, PtrInfo
);
1181 Chain
= Dst
.getValue(1);
1183 // Get offset in dword
1184 SDValue ByteIdx
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, LoadPtr
,
1185 DAG
.getConstant(0x3, DL
, MVT::i32
));
1187 // Convert byte offset to bit shift
1188 SDValue ShiftAmt
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ByteIdx
,
1189 DAG
.getConstant(3, DL
, MVT::i32
));
1191 // TODO: Contrary to the name of the functiom,
1192 // it also handles sub i32 non-truncating stores (like i1)
1193 SDValue SExtValue
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, MVT::i32
,
1196 // Mask the value to the right type
1197 SDValue MaskedValue
= DAG
.getZeroExtendInReg(SExtValue
, DL
, MemVT
);
1199 // Shift the value in place
1200 SDValue ShiftedValue
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
,
1201 MaskedValue
, ShiftAmt
);
1203 // Shift the mask in place
1204 SDValue DstMask
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, Mask
, ShiftAmt
);
1206 // Invert the mask. NOTE: if we had native ROL instructions we could
1207 // use inverted mask
1208 DstMask
= DAG
.getNOT(DL
, DstMask
, MVT::i32
);
1210 // Cleanup the target bits
1211 Dst
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, Dst
, DstMask
);
1214 SDValue Value
= DAG
.getNode(ISD::OR
, DL
, MVT::i32
, Dst
, ShiftedValue
);
1217 // TODO: Can we be smarter about MachinePointerInfo?
1218 SDValue NewStore
= DAG
.getStore(Chain
, DL
, Value
, Ptr
, PtrInfo
);
1220 // If we are part of expanded vector, make our neighbors depend on this store
1222 // Make all other vector elements depend on this store
1223 Chain
= DAG
.getNode(AMDGPUISD::DUMMY_CHAIN
, DL
, MVT::Other
, NewStore
);
1224 DAG
.ReplaceAllUsesOfValueWith(OldChain
, Chain
);
SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
  unsigned AS = StoreNode->getAddressSpace();

  SDValue Chain = StoreNode->getChain();
  SDValue Ptr = StoreNode->getBasePtr();
  SDValue Value = StoreNode->getValue();

  EVT VT = Value.getValueType();
  EVT MemVT = StoreNode->getMemoryVT();
  EVT PtrVT = Ptr.getValueType();

  SDLoc DL(Op);

  const bool TruncatingStore = StoreNode->isTruncatingStore();

  // Neither LOCAL nor PRIVATE can do vectors at the moment
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS ||
       TruncatingStore) &&
      VT.isVector()) {
    if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) {
      // Add an extra level of chain to isolate this vector
      SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
      // TODO: can the chain be replaced without creating a new store?
      SDValue NewStore = DAG.getTruncStore(
          NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
          MemVT, StoreNode->getAlignment(),
          StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
      StoreNode = cast<StoreSDNode>(NewStore);
    }

    return scalarizeVectorStore(StoreNode, DAG);
  }

  unsigned Align = StoreNode->getAlignment();
  if (Align < MemVT.getStoreSize() &&
      !allowsMisalignedMemoryAccesses(
          MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
    return expandUnalignedStore(StoreNode, DAG);
  }

  SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                  DAG.getConstant(2, DL, PtrVT));

  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
    // It is beneficial to create MSKOR here instead of combiner to avoid
    // artificial dependencies introduced by RMW
    if (TruncatingStore) {
      assert(VT.bitsLE(MVT::i32));
      SDValue MaskConstant;
      if (MemVT == MVT::i8) {
        MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
      } else {
        assert(MemVT == MVT::i16);
        assert(StoreNode->getAlignment() >= 2);
        MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
      }

      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, PtrVT, Ptr,
                                      DAG.getConstant(0x00000003, DL, PtrVT));
      SDValue BitShift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
                                     DAG.getConstant(3, DL, VT));

      // Put the mask in correct place
      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);

      // Put the value bits in correct place
      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);

      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
      // vector instead.
      SDValue Src[4] = {
        ShiftedValue,
        DAG.getConstant(0, DL, MVT::i32),
        DAG.getConstant(0, DL, MVT::i32),
        Mask
      };
      SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
      SDValue Args[3] = { Chain, Input, DWordAddr };
      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
                                     Op->getVTList(), Args, MemVT,
                                     StoreNode->getMemOperand());
    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR && VT.bitsGE(MVT::i32)) {
      // Convert pointer from byte address to dword address.
      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);

      if (StoreNode->isIndexed()) {
        llvm_unreachable("Indexed stores not supported yet");
      } else {
        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
      }
      return Chain;
    }
  }

  // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
    return SDValue();

  if (MemVT.bitsLT(MVT::i32))
    return lowerPrivateTruncStore(StoreNode, DAG);

  // Standard i32+ store, tag it with DWORDADDR to note that the address
  // has been shifted
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, PtrVT, DWordAddr);
    return DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
  }

  // Tagged i32+ stores will be matched by patterns
  return SDValue();
}
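
// Each CONSTANT_BUFFER_N address space below maps to constant-cache bank N:
// banks start at byte offset 512 and are 4096 bytes apart, i.e. 512 + 4096 * N.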
// return (512 + (kc_bank << 12)
static int
ConstantAddressBlock(unsigned AddressSpace) {
  switch (AddressSpace) {
  case AMDGPUAS::CONSTANT_BUFFER_0:
    return 512;
  case AMDGPUAS::CONSTANT_BUFFER_1:
    return 512 + 4096;
  case AMDGPUAS::CONSTANT_BUFFER_2:
    return 512 + 4096 * 2;
  case AMDGPUAS::CONSTANT_BUFFER_3:
    return 512 + 4096 * 3;
  case AMDGPUAS::CONSTANT_BUFFER_4:
    return 512 + 4096 * 4;
  case AMDGPUAS::CONSTANT_BUFFER_5:
    return 512 + 4096 * 5;
  case AMDGPUAS::CONSTANT_BUFFER_6:
    return 512 + 4096 * 6;
  case AMDGPUAS::CONSTANT_BUFFER_7:
    return 512 + 4096 * 7;
  case AMDGPUAS::CONSTANT_BUFFER_8:
    return 512 + 4096 * 8;
  case AMDGPUAS::CONSTANT_BUFFER_9:
    return 512 + 4096 * 9;
  case AMDGPUAS::CONSTANT_BUFFER_10:
    return 512 + 4096 * 10;
  case AMDGPUAS::CONSTANT_BUFFER_11:
    return 512 + 4096 * 11;
  case AMDGPUAS::CONSTANT_BUFFER_12:
    return 512 + 4096 * 12;
  case AMDGPUAS::CONSTANT_BUFFER_13:
    return 512 + 4096 * 13;
  case AMDGPUAS::CONSTANT_BUFFER_14:
    return 512 + 4096 * 14;
  case AMDGPUAS::CONSTANT_BUFFER_15:
    return 512 + 4096 * 15;
  default:
    return -1;
  }
}
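
// Sub-dword private extending loads are emulated below: read the containing
// dword, shift the addressed byte/halfword down to bit 0, then sign- or
// zero-extend in place depending on the extension type.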
SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();
  assert(Load->getAlignment() >= MemVT.getStoreSize());

  SDValue BasePtr = Load->getBasePtr();
  SDValue Chain = Load->getChain();
  SDValue Offset = Load->getOffset();

  SDValue LoadPtr = BasePtr;
  if (!Offset.isUndef()) {
    LoadPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, Offset);
  }

  // Get dword location
  // NOTE: this should be eliminated by the future SHR ptr, 2
  SDValue Ptr = DAG.getNode(ISD::AND, DL, MVT::i32, LoadPtr,
                            DAG.getConstant(0xfffffffc, DL, MVT::i32));

  // Load dword
  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(UndefValue::get(
      Type::getInt32PtrTy(*DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS)));
  SDValue Read = DAG.getLoad(MVT::i32, DL, Chain, Ptr, PtrInfo);

  // Get offset within the register.
  SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
                                LoadPtr, DAG.getConstant(0x3, DL, MVT::i32));

  // Bit offset of target byte (byteIdx * 8).
  SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
                                 DAG.getConstant(3, DL, MVT::i32));

  // Shift to the right.
  SDValue Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Read, ShiftAmt);

  // Eliminate the upper bits by setting them to ...
  EVT MemEltVT = MemVT.getScalarType();

  if (ExtType == ISD::SEXTLOAD) { // ... ones.
    SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
    Ret = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode);
  } else { // ... or zeros.
    Ret = DAG.getZeroExtendInReg(Ret, DL, MemEltVT);
  }

  SDValue Ops[] = {
    Ret,
    Read.getValue(1) // This should be our output chain
  };

  return DAG.getMergeValues(Ops, DL);
}
SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
  unsigned AS = LoadNode->getAddressSpace();
  EVT MemVT = LoadNode->getMemoryVT();
  ISD::LoadExtType ExtType = LoadNode->getExtensionType();

  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
      ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
    return lowerPrivateExtLoad(Op, DAG);
  }

  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();

  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
      VT.isVector()) {
    return scalarizeVectorLoad(LoadNode, DAG);
  }

  // This is still used for explicit load from addrspace(8)
  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
  if (ConstantBlock > -1 &&
      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
    SDValue Result;
    if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
        isa<ConstantSDNode>(Ptr)) {
      return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
    } else {
      //TODO: Does this even work?
      // non-constant ptr can't be folded, keeps it as a v4f32 load
      Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
          DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                      DAG.getConstant(4, DL, MVT::i32)),
          DAG.getConstant(LoadNode->getAddressSpace() -
                          AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32));
    }

    if (!VT.isVector()) {
      Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                           DAG.getConstant(0, DL, MVT::i32));
    }

    SDValue MergedValues[2] = {
      Result,
      Chain
    };
    return DAG.getMergeValues(MergedValues, DL);
  }

  // For most operations returning SDValue() will result in the node being
  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
  // need to manually expand loads that may be legal in some address spaces and
  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
  // compute shaders, since the data is sign extended when it is uploaded to the
  // buffer. However SEXT loads from other address spaces are not supported, so
  // we need to expand them here.
  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
    EVT MemVT = LoadNode->getMemoryVT();
    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
    SDValue NewLoad = DAG.getExtLoad(
        ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
        LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
    SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
                              DAG.getValueType(MemVT));

    SDValue MergedValues[2] = { Res, Chain };
    return DAG.getMergeValues(MergedValues, DL);
  }

  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return SDValue();
  }

  // DWORDADDR ISD marks already shifted address
  if (Ptr.getOpcode() != AMDGPUISD::DWORDADDR) {
    assert(VT == MVT::i32);
    Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(2, DL, MVT::i32));
    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, MVT::i32, Ptr);
    return DAG.getLoad(MVT::i32, DL, Chain, Ptr, LoadNode->getMemOperand());
  }
  return SDValue();
}
SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Jump  = Op.getOperand(2);

  return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
                     Chain, Jump, Cond);
}
SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
                                            SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const R600FrameLowering *TFL = Subtarget->getFrameLowering();

  FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);

  unsigned FrameIndex = FIN->getIndex();
  unsigned IgnoredFrameReg;
  unsigned Offset =
      TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
                         Op.getValueType());
}
CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool IsVarArg) const {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    llvm_unreachable("kernels should not be handled here");
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return CC_R600;
  default:
    report_fatal_error("Unsupported calling convention.");
  }
}
/// XXX Only kernel functions are supported, so we can assume for now that
/// every function is a kernel function, but in the future we should use
/// separate calling conventions for kernel and non-kernel functions.
SDValue R600TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  MachineFunction &MF = DAG.getMachineFunction();
  SmallVector<ISD::InputArg, 8> LocalIns;

  if (AMDGPU::isShader(CallConv)) {
    CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
  } else {
    analyzeFormalArgumentsCompute(CCInfo, Ins);
  }

  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    const ISD::InputArg &In = Ins[i];
    EVT VT = In.VT;
    EVT MemVT = VA.getLocVT();
    if (!VT.isVector() && MemVT.isVector()) {
      // Get load source type if scalarized.
      MemVT = MemVT.getVectorElementType();
    }

    if (AMDGPU::isShader(CallConv)) {
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass);
      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
      InVals.push_back(Register);
      continue;
    }

    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                          AMDGPUAS::PARAM_I_ADDRESS);

    // i64 isn't a legal type, so the register type used ends up as i32, which
    // isn't expected here. It attempts to create this sextload, but it ends up
    // being invalid. Somehow this seems to work with i64 arguments, but breaks
    // with smaller types.
    //
    // The first 36 bytes of the input buffer contains information about
    // thread group and global sizes.
    ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
    if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
      // FIXME: This should really check the extload type, but the handling of
      // extload vector parameters seems to be broken.

      // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
      Ext = ISD::SEXTLOAD;
    }

    // Compute the offset from the value.
    // XXX - I think PartOffset should give you this, but it seems to give the
    // size of the register which isn't useful.

    unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
    unsigned PartOffset = VA.getLocMemOffset();
    unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);

    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
    SDValue Arg = DAG.getLoad(
        ISD::UNINDEXED, Ext, VT, DL, Chain,
        DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
        PtrInfo,
        MemVT, Alignment, MachineMemOperand::MONonTemporal |
                              MachineMemOperand::MODereferenceable |
                              MachineMemOperand::MOInvariant);

    InVals.push_back(Arg);
  }
  return Chain;
}
R600TargetLowering::getSetCCResultType(const DataLayout
&DL
, LLVMContext
&,
1656 return VT
.changeVectorElementTypeToInteger();
bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
                                          const SelectionDAG &DAG) const {
  // Local and Private addresses do not handle vectors. Limit to i32
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS)) {
    return (MemVT.getSizeInBits() <= 32);
  }
  return true;
}
bool R600TargetLowering::allowsMisalignedMemoryAccesses(
    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
    bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  if (VT.bitsLT(MVT::i32))
    return false;

  // TODO: This is a rough estimate.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
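
// Note on the helper below (illustrative summary, not an original comment):
// CompactSwizzlableVector rewrites a 4-element build_vector so that constant
// 0.0/1.0 lanes and duplicated lanes become undef, recording in RemapSwizzle
// how each original lane should be re-read (4 = SEL_0, 5 = SEL_1,
// 7 = SEL_MASK_WRITE, otherwise the index of an earlier identical lane).
// For example, (x, 0.0, 1.0, x) would map lanes 1, 2 and 3 to SEL_0, SEL_1
// and lane 0 respectively.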
static SDValue CompactSwizzlableVector(
  SelectionDAG &DAG, SDValue VectorEntry,
  DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(RemapSwizzle.empty());

  SDLoc DL(VectorEntry);
  EVT EltTy = VectorEntry.getValueType().getVectorElementType();

  SDValue NewBldVec[4];
  for (unsigned i = 0; i < 4; i++)
    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
                               DAG.getIntPtrConstant(i, DL));
  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].isUndef())
      // We mask write here to teach later passes that the ith element of this
      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
      // break false dependencies and additionally make assembly easier to read.
      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
      if (C->isZero()) {
        RemapSwizzle[i] = 4; // SEL_0
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      } else if (C->isExactlyValue(1.0)) {
        RemapSwizzle[i] = 5; // SEL_1
        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
      }
    }

    if (NewBldVec[i].isUndef())
      continue;

    // Fix spurious warning with gcc 7.3 -O3
    //    warning: array subscript is above array bounds [-Warray-bounds]
    // if (NewBldVec[i] == NewBldVec[j]) {
    for (unsigned j = 0; j < i; j++) {
      if (NewBldVec[i] == NewBldVec[j]) {
        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
        RemapSwizzle[i] = j;
        break;
      }
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}
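
// Note on the helper below (illustrative summary, not an original comment):
// ReorganizeVector tries to move each extract_vector_elt operand back to the
// lane it was extracted from so that the final swizzle becomes the identity
// where possible; lanes already in place are pinned via isUnmovable, and
// RemapSwizzle records the single swap that is performed.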
static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
  assert(RemapSwizzle.empty());

  SDLoc DL(VectorEntry);
  EVT EltTy = VectorEntry.getValueType().getVectorElementType();

  SDValue NewBldVec[4];
  bool isUnmovable[4] = {false, false, false, false};
  for (unsigned i = 0; i < 4; i++)
    NewBldVec[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltTy, VectorEntry,
                               DAG.getIntPtrConstant(i, DL));

  for (unsigned i = 0; i < 4; i++) {
    RemapSwizzle[i] = i;
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (i == Idx)
        isUnmovable[Idx] = true;
    }
  }

  for (unsigned i = 0; i < 4; i++) {
    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
          ->getZExtValue();
      if (isUnmovable[Idx])
        continue;
      // Swap i and Idx
      std::swap(NewBldVec[Idx], NewBldVec[i]);
      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
      break;
    }
  }

  return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
                            NewBldVec);
}
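
// Note (illustrative summary, not an original comment): OptimizeSwizzle runs
// both helpers above and rewrites the four swizzle-selector constants in Swz
// through the Old -> New maps they produce, so the callers in
// PerformDAGCombine keep their operand lists in sync with the compacted
// build_vector.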
SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
                                            SelectionDAG &DAG,
                                            const SDLoc &DL) const {
  // Old -> New swizzle values
  DenseMap<unsigned, unsigned> SwizzleRemap;

  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  SwizzleRemap.clear();
  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
  for (unsigned i = 0; i < 4; i++) {
    unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
  }

  return BuildVector;
}
SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
                                            SelectionDAG &DAG) const {
  SDLoc DL(LoadNode);
  EVT VT = LoadNode->getValueType(0);
  SDValue Chain = LoadNode->getChain();
  SDValue Ptr = LoadNode->getBasePtr();
  assert (isa<ConstantSDNode>(Ptr));

  //TODO: Support smaller loads
  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 ||
      !ISD::isNON_EXTLoad(LoadNode))
    return SDValue();

  if (LoadNode->getAlignment() < 4)
    return SDValue();

  int ConstantBlock = ConstantAddressBlock(Block);

  SDValue Slots[4];
  for (unsigned i = 0; i < 4; i++) {
    // We want Const position encoded with the following formula :
    // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
    // const_index is Ptr computed by llvm using an alignment of 16.
    // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
    // then div by 4 at the ISel step
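    // Worked example of the encoding above (illustrative): with kc_bank = 0,
    // const_index = 1 and chan = 2 it evaluates to ((512 + 0 + 1) << 2) + 2,
    // i.e. 2054.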
    SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
  }
  EVT NewVT = MVT::v4i32;
  unsigned NumElements = 4;
  if (VT.isVector()) {
    NewVT = VT;
    NumElements = VT.getVectorNumElements();
  }
  SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
  if (!VT.isVector()) {
    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
                         DAG.getConstant(0, DL, MVT::i32));
  }
  SDValue MergedValues[2] = {
    Result,
    Chain
  };
  return DAG.getMergeValues(MergedValues, DL);
}
//===----------------------------------------------------------------------===//
// Custom DAG Optimizations
//===----------------------------------------------------------------------===//

SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
  case ISD::FP_ROUND: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
      return DAG.getNode(ISD::UINT_TO_FP, DL, N->getValueType(0),
                         Arg.getOperand(0));
    }
    break;
  }
  // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
  // (i32 select_cc f32, f32, -1, 0 cc)
  //
  // Mesa's GLSL frontend generates the above pattern a lot and we can lower
  // this to one of the SET*_DX10 instructions.
  case ISD::FP_TO_SINT: {
    SDValue FNeg = N->getOperand(0);
    if (FNeg.getOpcode() != ISD::FNEG) {
      return SDValue();
    }
    SDValue SelectCC = FNeg.getOperand(0);
    if (SelectCC.getOpcode() != ISD::SELECT_CC ||
        SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
        SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
        !isHWTrueValue(SelectCC.getOperand(2)) ||
        !isHWFalseValue(SelectCC.getOperand(3))) {
      return SDValue();
    }

    return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0),
                       SelectCC.getOperand(0), // LHS
                       SelectCC.getOperand(1), // RHS
                       DAG.getConstant(-1, DL, MVT::i32), // True
                       DAG.getConstant(0, DL, MVT::i32),  // False
                       SelectCC.getOperand(4)); // CC
  }
  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
  // => build_vector elt0, ... , NewEltIdx, ... , eltN
  case ISD::INSERT_VECTOR_ELT: {
    SDValue InVec = N->getOperand(0);
    SDValue InVal = N->getOperand(1);
    SDValue EltNo = N->getOperand(2);

    // If the inserted element is an UNDEF, just use the input vector.
    if (InVal.isUndef())
      return InVec;

    EVT VT = InVec.getValueType();

    // If we can't generate a legal BUILD_VECTOR, exit
    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
      return SDValue();

    // Check that we know which element is being inserted
    if (!isa<ConstantSDNode>(EltNo))
      return SDValue();
    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
    // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
    // vector elements.
    SmallVector<SDValue, 8> Ops;
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      Ops.append(InVec.getNode()->op_begin(),
                 InVec.getNode()->op_end());
    } else if (InVec.isUndef()) {
      unsigned NElts = VT.getVectorNumElements();
      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
    } else {
      return SDValue();
    }

    // Insert the element
    if (Elt < Ops.size()) {
      // All the operands of BUILD_VECTOR must have the same type;
      // we enforce that here.
      EVT OpVT = Ops[0].getValueType();
      if (InVal.getValueType() != OpVT)
        InVal = OpVT.bitsGT(InVal.getValueType()) ?
            DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
            DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
      Ops[Elt] = InVal;
    }

    // Return the new vector
    return DAG.getBuildVector(VT, DL, Ops);
  }
  // Extract_vec (Build_vector) generated by custom lowering
  // also needs to be custom combined here.
  case ISD::EXTRACT_VECTOR_ELT: {
    SDValue Arg = N->getOperand(0);
    if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return Arg->getOperand(Element);
      }
    }
    if (Arg.getOpcode() == ISD::BITCAST &&
        Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
        (Arg.getOperand(0).getValueType().getVectorNumElements() ==
         Arg.getValueType().getVectorNumElements())) {
      if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
        unsigned Element = Const->getZExtValue();
        return DAG.getNode(ISD::BITCAST, DL, N->getVTList(),
                           Arg->getOperand(0).getOperand(Element));
      }
    }
    break;
  }
  case ISD::SELECT_CC: {
    // Try common optimizations
    if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
      return Ret;

    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
    //      selectcc x, y, a, b, inv(cc)
    //
    // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
    //      selectcc x, y, a, b, cc
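    //
    // Illustrative instance (not from the original comment): with cc == setlt,
    // the seteq form above becomes selectcc x, y, a, b, setge, i.e. the outer
    // compare against b is folded away by inverting the inner condition.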
    SDValue LHS = N->getOperand(0);
    if (LHS.getOpcode() != ISD::SELECT_CC) {
      return SDValue();
    }

    SDValue RHS = N->getOperand(1);
    SDValue True = N->getOperand(2);
    SDValue False = N->getOperand(3);
    ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();

    if (LHS.getOperand(2).getNode() != True.getNode() ||
        LHS.getOperand(3).getNode() != False.getNode() ||
        RHS.getNode() != False.getNode()) {
      return SDValue();
    }

    switch (NCC) {
    default: return SDValue();
    case ISD::SETNE: return LHS;
    case ISD::SETEQ: {
      ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
      LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
      if (DCI.isBeforeLegalizeOps() ||
          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
        return DAG.getSelectCC(DL,
                               LHS.getOperand(0),
                               LHS.getOperand(1),
                               LHS.getOperand(2),
                               LHS.getOperand(3),
                               LHSCC);
      break;
    }
    }
    return SDValue();
  }
  case AMDGPUISD::R600_EXPORT: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[8] = {
      N->getOperand(0), // Chain
      SDValue(),
      N->getOperand(2), // ArrayBase
      N->getOperand(3), // Type
      N->getOperand(4), // SWZ_X
      N->getOperand(5), // SWZ_Y
      N->getOperand(6), // SWZ_Z
      N->getOperand(7)  // SWZ_W
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
    return DAG.getNode(AMDGPUISD::R600_EXPORT, DL, N->getVTList(), NewArgs);
  }
  case AMDGPUISD::TEXTURE_FETCH: {
    SDValue Arg = N->getOperand(1);
    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
      break;

    SDValue NewArgs[19] = {
      N->getOperand(0),
      N->getOperand(1),
      N->getOperand(2),
      N->getOperand(3),
      N->getOperand(4),
      N->getOperand(5),
      N->getOperand(6),
      N->getOperand(7),
      N->getOperand(8),
      N->getOperand(9),
      N->getOperand(10),
      N->getOperand(11),
      N->getOperand(12),
      N->getOperand(13),
      N->getOperand(14),
      N->getOperand(15),
      N->getOperand(16),
      N->getOperand(17),
      N->getOperand(18)
    };
    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
  }
  case ISD::LOAD: {
    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
    SDValue Ptr = LoadNode->getBasePtr();
    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
        isa<ConstantSDNode>(Ptr))
      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
    break;
  }

  default: break;
  }

  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
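
// Note (illustrative summary, not an original comment): FoldOperand tries to
// absorb a source operand that is an FNEG/FABS, a constant-buffer copy, or an
// immediate move into the parent machine node's neg/abs, sel, or literal
// operands, returning true when Ops has been rewritten and the node should be
// rebuilt.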
bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
                                     SDValue &Src, SDValue &Neg, SDValue &Abs,
                                     SDValue &Sel, SDValue &Imm,
                                     SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Src.isMachineOpcode())
    return false;

  switch (Src.getMachineOpcode()) {
  case R600::FNEG_R600:
    if (!Neg.getNode())
      return false;
    Src = Src.getOperand(0);
    Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case R600::FABS_R600:
    if (!Abs.getNode())
      return false;
    Src = Src.getOperand(0);
    Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
    return true;
  case R600::CONST_COPY: {
    unsigned Opcode = ParentNode->getMachineOpcode();
    bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;

    if (!Sel.getNode())
      return false;

    SDValue CstOffset = Src.getOperand(0);
    if (ParentNode->getValueType(0).isVector())
      return false;

    // Gather constant values
    int SrcIndices[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2),
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    std::vector<unsigned> Consts;
    for (int OtherSrcIdx : SrcIndices) {
      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
        continue;
      if (HasDst) {
        OtherSrcIdx--;
        OtherSelIdx--;
      }
      if (RegisterSDNode *Reg =
          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
        if (Reg->getReg() == R600::ALU_CONST) {
          ConstantSDNode *Cst
            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
          Consts.push_back(Cst->getZExtValue());
        }
      }
    }

    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
    Consts.push_back(Cst->getZExtValue());
    if (!TII->fitsConstReadLimitations(Consts)) {
      return false;
    }

    Sel = CstOffset;
    Src = DAG.getRegister(R600::ALU_CONST, MVT::f32);
    return true;
  }
  case R600::MOV_IMM_GLOBAL_ADDR:
    // Check if the Imm slot is used. Taken from below.
    if (cast<ConstantSDNode>(Imm)->getZExtValue())
      return false;
    Imm = Src.getOperand(0);
    Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32);
    return true;
  case R600::MOV_IMM_I32:
  case R600::MOV_IMM_F32: {
    unsigned ImmReg = R600::ALU_LITERAL_X;
    uint64_t ImmValue = 0;

    if (Src.getMachineOpcode() == R600::MOV_IMM_F32) {
      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
      float FloatValue = FPC->getValueAPF().convertToFloat();
      if (FloatValue == 0.0) {
        ImmReg = R600::ZERO;
      } else if (FloatValue == 0.5) {
        ImmReg = R600::HALF;
      } else if (FloatValue == 1.0) {
        ImmReg = R600::ONE;
      } else {
        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
      }
    } else {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
      uint64_t Value = C->getZExtValue();
      if (Value == 0) {
        ImmReg = R600::ZERO;
      } else if (Value == 1) {
        ImmReg = R600::ONE_INT;
      } else {
        ImmValue = Value;
      }
    }

    // Check that we aren't already using an immediate.
    // XXX: It's possible for an instruction to have more than one
    // immediate operand, but this is not supported yet.
    if (ImmReg == R600::ALU_LITERAL_X) {
      if (!Imm.getNode())
        return false;
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
      assert(C);
      if (C->getZExtValue())
        return false;
      Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
    }
    Src = DAG.getRegister(ImmReg, MVT::i32);
    return true;
  }
  default:
    return false;
  }
}
/// Fold the instructions after selecting them
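/// For example (illustrative): an ALU node whose source is an FNEG_R600 gets
/// the FNEG stripped and the corresponding src*_neg modifier set instead, via
/// FoldOperand above.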
SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
                                            SelectionDAG &DAG) const {
  const R600InstrInfo *TII = Subtarget->getInstrInfo();
  if (!Node->isMachineOpcode())
    return Node;

  unsigned Opcode = Node->getMachineOpcode();
  SDValue FakeOp;

  std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());

  if (Opcode == R600::DOT_4) {
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_W)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W)
    };
    for (unsigned i = 0; i < 8; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue &Abs = Ops[AbsIdx[i] - 1];
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      if (HasDst)
        SelIdx--;
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else if (Opcode == R600::REG_SEQUENCE) {
    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
      SDValue &Src = Ops[i];
      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  } else {
    if (!TII->hasInstrModifiers(Opcode))
      return Node;
    int OperandIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0),
      TII->getOperandIdx(Opcode, R600::OpName::src1),
      TII->getOperandIdx(Opcode, R600::OpName::src2)
    };
    int NegIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src1_neg),
      TII->getOperandIdx(Opcode, R600::OpName::src2_neg)
    };
    int AbsIdx[] = {
      TII->getOperandIdx(Opcode, R600::OpName::src0_abs),
      TII->getOperandIdx(Opcode, R600::OpName::src1_abs),
      -1
    };
    for (unsigned i = 0; i < 3; i++) {
      if (OperandIdx[i] < 0)
        return Node;
      SDValue &Src = Ops[OperandIdx[i] - 1];
      SDValue &Neg = Ops[NegIdx[i] - 1];
      SDValue FakeAbs;
      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
      bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1;
      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
      int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal);
      if (HasDst) {
        SelIdx--;
        ImmIdx--;
      }
      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
      SDValue &Imm = Ops[ImmIdx];
      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
    }
  }

  return Node;
}