Run DCE after a LoopFlatten test to reduce spurious output [nfc]
[llvm-project.git] / llvm / lib / Target / NVPTX / NVPTXISelDAGToDAG.cpp
blob68391cdb6ff172bb3a1ee569d1c5107a78607e03
1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines an instruction selector for the NVPTX target.
11 //===----------------------------------------------------------------------===//
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/CodeGen/ISDOpcodes.h"
18 #include "llvm/IR/GlobalValue.h"
19 #include "llvm/IR/Instructions.h"
20 #include "llvm/IR/IntrinsicsNVPTX.h"
21 #include "llvm/Support/AtomicOrdering.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/Debug.h"
24 #include "llvm/Support/ErrorHandling.h"
25 #include "llvm/Support/raw_ostream.h"
26 #include "llvm/Target/TargetIntrinsicInfo.h"
28 using namespace llvm;
30 #define DEBUG_TYPE "nvptx-isel"
31 #define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"
33 /// createNVPTXISelDag - This pass converts a legalized DAG into a
34 /// NVPTX-specific DAG, ready for instruction scheduling.
35 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
36 llvm::CodeGenOptLevel OptLevel) {
37 return new NVPTXDAGToDAGISel(TM, OptLevel);
40 char NVPTXDAGToDAGISel::ID = 0;
42 INITIALIZE_PASS(NVPTXDAGToDAGISel, DEBUG_TYPE, PASS_NAME, false, false)
44 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
45 CodeGenOptLevel OptLevel)
46 : SelectionDAGISel(ID, tm, OptLevel), TM(tm) {
47 doMulWide = (OptLevel > CodeGenOptLevel::None);
50 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
51 Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
52 return SelectionDAGISel::runOnMachineFunction(MF);
55 int NVPTXDAGToDAGISel::getDivF32Level() const {
56 return Subtarget->getTargetLowering()->getDivF32Level();
59 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
60 return Subtarget->getTargetLowering()->usePrecSqrtF32();
63 bool NVPTXDAGToDAGISel::useF32FTZ() const {
64 return Subtarget->getTargetLowering()->useF32FTZ(*MF);
67 bool NVPTXDAGToDAGISel::allowFMA() const {
68 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
69 return TL->allowFMA(*MF, OptLevel);
72 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
73 const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
74 return TL->allowUnsafeFPMath(*MF);
77 bool NVPTXDAGToDAGISel::useShortPointers() const {
78 return TM.useShortPointers();
81 /// Select - Select instructions not customized! Used for
82 /// expanded, promoted and normal instructions.
83 void NVPTXDAGToDAGISel::Select(SDNode *N) {
85 if (N->isMachineOpcode()) {
86 N->setNodeId(-1);
87 return; // Already selected.
90 switch (N->getOpcode()) {
91 case ISD::LOAD:
92 case ISD::ATOMIC_LOAD:
93 if (tryLoad(N))
94 return;
95 break;
96 case ISD::STORE:
97 case ISD::ATOMIC_STORE:
98 if (tryStore(N))
99 return;
100 break;
101 case ISD::EXTRACT_VECTOR_ELT:
102 if (tryEXTRACT_VECTOR_ELEMENT(N))
103 return;
104 break;
105 case NVPTXISD::SETP_F16X2:
106 SelectSETP_F16X2(N);
107 return;
109 case NVPTXISD::LoadV2:
110 case NVPTXISD::LoadV4:
111 if (tryLoadVector(N))
112 return;
113 break;
114 case NVPTXISD::LDGV2:
115 case NVPTXISD::LDGV4:
116 case NVPTXISD::LDUV2:
117 case NVPTXISD::LDUV4:
118 if (tryLDGLDU(N))
119 return;
120 break;
121 case NVPTXISD::StoreV2:
122 case NVPTXISD::StoreV4:
123 if (tryStoreVector(N))
124 return;
125 break;
126 case NVPTXISD::LoadParam:
127 case NVPTXISD::LoadParamV2:
128 case NVPTXISD::LoadParamV4:
129 if (tryLoadParam(N))
130 return;
131 break;
132 case NVPTXISD::StoreRetval:
133 case NVPTXISD::StoreRetvalV2:
134 case NVPTXISD::StoreRetvalV4:
135 if (tryStoreRetval(N))
136 return;
137 break;
138 case NVPTXISD::StoreParam:
139 case NVPTXISD::StoreParamV2:
140 case NVPTXISD::StoreParamV4:
141 case NVPTXISD::StoreParamS32:
142 case NVPTXISD::StoreParamU32:
143 if (tryStoreParam(N))
144 return;
145 break;
146 case ISD::INTRINSIC_WO_CHAIN:
147 if (tryIntrinsicNoChain(N))
148 return;
149 break;
150 case ISD::INTRINSIC_W_CHAIN:
151 if (tryIntrinsicChain(N))
152 return;
153 break;
154 case NVPTXISD::Tex1DFloatS32:
155 case NVPTXISD::Tex1DFloatFloat:
156 case NVPTXISD::Tex1DFloatFloatLevel:
157 case NVPTXISD::Tex1DFloatFloatGrad:
158 case NVPTXISD::Tex1DS32S32:
159 case NVPTXISD::Tex1DS32Float:
160 case NVPTXISD::Tex1DS32FloatLevel:
161 case NVPTXISD::Tex1DS32FloatGrad:
162 case NVPTXISD::Tex1DU32S32:
163 case NVPTXISD::Tex1DU32Float:
164 case NVPTXISD::Tex1DU32FloatLevel:
165 case NVPTXISD::Tex1DU32FloatGrad:
166 case NVPTXISD::Tex1DArrayFloatS32:
167 case NVPTXISD::Tex1DArrayFloatFloat:
168 case NVPTXISD::Tex1DArrayFloatFloatLevel:
169 case NVPTXISD::Tex1DArrayFloatFloatGrad:
170 case NVPTXISD::Tex1DArrayS32S32:
171 case NVPTXISD::Tex1DArrayS32Float:
172 case NVPTXISD::Tex1DArrayS32FloatLevel:
173 case NVPTXISD::Tex1DArrayS32FloatGrad:
174 case NVPTXISD::Tex1DArrayU32S32:
175 case NVPTXISD::Tex1DArrayU32Float:
176 case NVPTXISD::Tex1DArrayU32FloatLevel:
177 case NVPTXISD::Tex1DArrayU32FloatGrad:
178 case NVPTXISD::Tex2DFloatS32:
179 case NVPTXISD::Tex2DFloatFloat:
180 case NVPTXISD::Tex2DFloatFloatLevel:
181 case NVPTXISD::Tex2DFloatFloatGrad:
182 case NVPTXISD::Tex2DS32S32:
183 case NVPTXISD::Tex2DS32Float:
184 case NVPTXISD::Tex2DS32FloatLevel:
185 case NVPTXISD::Tex2DS32FloatGrad:
186 case NVPTXISD::Tex2DU32S32:
187 case NVPTXISD::Tex2DU32Float:
188 case NVPTXISD::Tex2DU32FloatLevel:
189 case NVPTXISD::Tex2DU32FloatGrad:
190 case NVPTXISD::Tex2DArrayFloatS32:
191 case NVPTXISD::Tex2DArrayFloatFloat:
192 case NVPTXISD::Tex2DArrayFloatFloatLevel:
193 case NVPTXISD::Tex2DArrayFloatFloatGrad:
194 case NVPTXISD::Tex2DArrayS32S32:
195 case NVPTXISD::Tex2DArrayS32Float:
196 case NVPTXISD::Tex2DArrayS32FloatLevel:
197 case NVPTXISD::Tex2DArrayS32FloatGrad:
198 case NVPTXISD::Tex2DArrayU32S32:
199 case NVPTXISD::Tex2DArrayU32Float:
200 case NVPTXISD::Tex2DArrayU32FloatLevel:
201 case NVPTXISD::Tex2DArrayU32FloatGrad:
202 case NVPTXISD::Tex3DFloatS32:
203 case NVPTXISD::Tex3DFloatFloat:
204 case NVPTXISD::Tex3DFloatFloatLevel:
205 case NVPTXISD::Tex3DFloatFloatGrad:
206 case NVPTXISD::Tex3DS32S32:
207 case NVPTXISD::Tex3DS32Float:
208 case NVPTXISD::Tex3DS32FloatLevel:
209 case NVPTXISD::Tex3DS32FloatGrad:
210 case NVPTXISD::Tex3DU32S32:
211 case NVPTXISD::Tex3DU32Float:
212 case NVPTXISD::Tex3DU32FloatLevel:
213 case NVPTXISD::Tex3DU32FloatGrad:
214 case NVPTXISD::TexCubeFloatFloat:
215 case NVPTXISD::TexCubeFloatFloatLevel:
216 case NVPTXISD::TexCubeS32Float:
217 case NVPTXISD::TexCubeS32FloatLevel:
218 case NVPTXISD::TexCubeU32Float:
219 case NVPTXISD::TexCubeU32FloatLevel:
220 case NVPTXISD::TexCubeArrayFloatFloat:
221 case NVPTXISD::TexCubeArrayFloatFloatLevel:
222 case NVPTXISD::TexCubeArrayS32Float:
223 case NVPTXISD::TexCubeArrayS32FloatLevel:
224 case NVPTXISD::TexCubeArrayU32Float:
225 case NVPTXISD::TexCubeArrayU32FloatLevel:
226 case NVPTXISD::Tld4R2DFloatFloat:
227 case NVPTXISD::Tld4G2DFloatFloat:
228 case NVPTXISD::Tld4B2DFloatFloat:
229 case NVPTXISD::Tld4A2DFloatFloat:
230 case NVPTXISD::Tld4R2DS64Float:
231 case NVPTXISD::Tld4G2DS64Float:
232 case NVPTXISD::Tld4B2DS64Float:
233 case NVPTXISD::Tld4A2DS64Float:
234 case NVPTXISD::Tld4R2DU64Float:
235 case NVPTXISD::Tld4G2DU64Float:
236 case NVPTXISD::Tld4B2DU64Float:
237 case NVPTXISD::Tld4A2DU64Float:
238 case NVPTXISD::TexUnified1DFloatS32:
239 case NVPTXISD::TexUnified1DFloatFloat:
240 case NVPTXISD::TexUnified1DFloatFloatLevel:
241 case NVPTXISD::TexUnified1DFloatFloatGrad:
242 case NVPTXISD::TexUnified1DS32S32:
243 case NVPTXISD::TexUnified1DS32Float:
244 case NVPTXISD::TexUnified1DS32FloatLevel:
245 case NVPTXISD::TexUnified1DS32FloatGrad:
246 case NVPTXISD::TexUnified1DU32S32:
247 case NVPTXISD::TexUnified1DU32Float:
248 case NVPTXISD::TexUnified1DU32FloatLevel:
249 case NVPTXISD::TexUnified1DU32FloatGrad:
250 case NVPTXISD::TexUnified1DArrayFloatS32:
251 case NVPTXISD::TexUnified1DArrayFloatFloat:
252 case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
253 case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
254 case NVPTXISD::TexUnified1DArrayS32S32:
255 case NVPTXISD::TexUnified1DArrayS32Float:
256 case NVPTXISD::TexUnified1DArrayS32FloatLevel:
257 case NVPTXISD::TexUnified1DArrayS32FloatGrad:
258 case NVPTXISD::TexUnified1DArrayU32S32:
259 case NVPTXISD::TexUnified1DArrayU32Float:
260 case NVPTXISD::TexUnified1DArrayU32FloatLevel:
261 case NVPTXISD::TexUnified1DArrayU32FloatGrad:
262 case NVPTXISD::TexUnified2DFloatS32:
263 case NVPTXISD::TexUnified2DFloatFloat:
264 case NVPTXISD::TexUnified2DFloatFloatLevel:
265 case NVPTXISD::TexUnified2DFloatFloatGrad:
266 case NVPTXISD::TexUnified2DS32S32:
267 case NVPTXISD::TexUnified2DS32Float:
268 case NVPTXISD::TexUnified2DS32FloatLevel:
269 case NVPTXISD::TexUnified2DS32FloatGrad:
270 case NVPTXISD::TexUnified2DU32S32:
271 case NVPTXISD::TexUnified2DU32Float:
272 case NVPTXISD::TexUnified2DU32FloatLevel:
273 case NVPTXISD::TexUnified2DU32FloatGrad:
274 case NVPTXISD::TexUnified2DArrayFloatS32:
275 case NVPTXISD::TexUnified2DArrayFloatFloat:
276 case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
277 case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
278 case NVPTXISD::TexUnified2DArrayS32S32:
279 case NVPTXISD::TexUnified2DArrayS32Float:
280 case NVPTXISD::TexUnified2DArrayS32FloatLevel:
281 case NVPTXISD::TexUnified2DArrayS32FloatGrad:
282 case NVPTXISD::TexUnified2DArrayU32S32:
283 case NVPTXISD::TexUnified2DArrayU32Float:
284 case NVPTXISD::TexUnified2DArrayU32FloatLevel:
285 case NVPTXISD::TexUnified2DArrayU32FloatGrad:
286 case NVPTXISD::TexUnified3DFloatS32:
287 case NVPTXISD::TexUnified3DFloatFloat:
288 case NVPTXISD::TexUnified3DFloatFloatLevel:
289 case NVPTXISD::TexUnified3DFloatFloatGrad:
290 case NVPTXISD::TexUnified3DS32S32:
291 case NVPTXISD::TexUnified3DS32Float:
292 case NVPTXISD::TexUnified3DS32FloatLevel:
293 case NVPTXISD::TexUnified3DS32FloatGrad:
294 case NVPTXISD::TexUnified3DU32S32:
295 case NVPTXISD::TexUnified3DU32Float:
296 case NVPTXISD::TexUnified3DU32FloatLevel:
297 case NVPTXISD::TexUnified3DU32FloatGrad:
298 case NVPTXISD::TexUnifiedCubeFloatFloat:
299 case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
300 case NVPTXISD::TexUnifiedCubeS32Float:
301 case NVPTXISD::TexUnifiedCubeS32FloatLevel:
302 case NVPTXISD::TexUnifiedCubeU32Float:
303 case NVPTXISD::TexUnifiedCubeU32FloatLevel:
304 case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
305 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
306 case NVPTXISD::TexUnifiedCubeArrayS32Float:
307 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
308 case NVPTXISD::TexUnifiedCubeArrayU32Float:
309 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
310 case NVPTXISD::Tld4UnifiedR2DFloatFloat:
311 case NVPTXISD::Tld4UnifiedG2DFloatFloat:
312 case NVPTXISD::Tld4UnifiedB2DFloatFloat:
313 case NVPTXISD::Tld4UnifiedA2DFloatFloat:
314 case NVPTXISD::Tld4UnifiedR2DS64Float:
315 case NVPTXISD::Tld4UnifiedG2DS64Float:
316 case NVPTXISD::Tld4UnifiedB2DS64Float:
317 case NVPTXISD::Tld4UnifiedA2DS64Float:
318 case NVPTXISD::Tld4UnifiedR2DU64Float:
319 case NVPTXISD::Tld4UnifiedG2DU64Float:
320 case NVPTXISD::Tld4UnifiedB2DU64Float:
321 case NVPTXISD::Tld4UnifiedA2DU64Float:
322 if (tryTextureIntrinsic(N))
323 return;
324 break;
325 case NVPTXISD::Suld1DI8Clamp:
326 case NVPTXISD::Suld1DI16Clamp:
327 case NVPTXISD::Suld1DI32Clamp:
328 case NVPTXISD::Suld1DI64Clamp:
329 case NVPTXISD::Suld1DV2I8Clamp:
330 case NVPTXISD::Suld1DV2I16Clamp:
331 case NVPTXISD::Suld1DV2I32Clamp:
332 case NVPTXISD::Suld1DV2I64Clamp:
333 case NVPTXISD::Suld1DV4I8Clamp:
334 case NVPTXISD::Suld1DV4I16Clamp:
335 case NVPTXISD::Suld1DV4I32Clamp:
336 case NVPTXISD::Suld1DArrayI8Clamp:
337 case NVPTXISD::Suld1DArrayI16Clamp:
338 case NVPTXISD::Suld1DArrayI32Clamp:
339 case NVPTXISD::Suld1DArrayI64Clamp:
340 case NVPTXISD::Suld1DArrayV2I8Clamp:
341 case NVPTXISD::Suld1DArrayV2I16Clamp:
342 case NVPTXISD::Suld1DArrayV2I32Clamp:
343 case NVPTXISD::Suld1DArrayV2I64Clamp:
344 case NVPTXISD::Suld1DArrayV4I8Clamp:
345 case NVPTXISD::Suld1DArrayV4I16Clamp:
346 case NVPTXISD::Suld1DArrayV4I32Clamp:
347 case NVPTXISD::Suld2DI8Clamp:
348 case NVPTXISD::Suld2DI16Clamp:
349 case NVPTXISD::Suld2DI32Clamp:
350 case NVPTXISD::Suld2DI64Clamp:
351 case NVPTXISD::Suld2DV2I8Clamp:
352 case NVPTXISD::Suld2DV2I16Clamp:
353 case NVPTXISD::Suld2DV2I32Clamp:
354 case NVPTXISD::Suld2DV2I64Clamp:
355 case NVPTXISD::Suld2DV4I8Clamp:
356 case NVPTXISD::Suld2DV4I16Clamp:
357 case NVPTXISD::Suld2DV4I32Clamp:
358 case NVPTXISD::Suld2DArrayI8Clamp:
359 case NVPTXISD::Suld2DArrayI16Clamp:
360 case NVPTXISD::Suld2DArrayI32Clamp:
361 case NVPTXISD::Suld2DArrayI64Clamp:
362 case NVPTXISD::Suld2DArrayV2I8Clamp:
363 case NVPTXISD::Suld2DArrayV2I16Clamp:
364 case NVPTXISD::Suld2DArrayV2I32Clamp:
365 case NVPTXISD::Suld2DArrayV2I64Clamp:
366 case NVPTXISD::Suld2DArrayV4I8Clamp:
367 case NVPTXISD::Suld2DArrayV4I16Clamp:
368 case NVPTXISD::Suld2DArrayV4I32Clamp:
369 case NVPTXISD::Suld3DI8Clamp:
370 case NVPTXISD::Suld3DI16Clamp:
371 case NVPTXISD::Suld3DI32Clamp:
372 case NVPTXISD::Suld3DI64Clamp:
373 case NVPTXISD::Suld3DV2I8Clamp:
374 case NVPTXISD::Suld3DV2I16Clamp:
375 case NVPTXISD::Suld3DV2I32Clamp:
376 case NVPTXISD::Suld3DV2I64Clamp:
377 case NVPTXISD::Suld3DV4I8Clamp:
378 case NVPTXISD::Suld3DV4I16Clamp:
379 case NVPTXISD::Suld3DV4I32Clamp:
380 case NVPTXISD::Suld1DI8Trap:
381 case NVPTXISD::Suld1DI16Trap:
382 case NVPTXISD::Suld1DI32Trap:
383 case NVPTXISD::Suld1DI64Trap:
384 case NVPTXISD::Suld1DV2I8Trap:
385 case NVPTXISD::Suld1DV2I16Trap:
386 case NVPTXISD::Suld1DV2I32Trap:
387 case NVPTXISD::Suld1DV2I64Trap:
388 case NVPTXISD::Suld1DV4I8Trap:
389 case NVPTXISD::Suld1DV4I16Trap:
390 case NVPTXISD::Suld1DV4I32Trap:
391 case NVPTXISD::Suld1DArrayI8Trap:
392 case NVPTXISD::Suld1DArrayI16Trap:
393 case NVPTXISD::Suld1DArrayI32Trap:
394 case NVPTXISD::Suld1DArrayI64Trap:
395 case NVPTXISD::Suld1DArrayV2I8Trap:
396 case NVPTXISD::Suld1DArrayV2I16Trap:
397 case NVPTXISD::Suld1DArrayV2I32Trap:
398 case NVPTXISD::Suld1DArrayV2I64Trap:
399 case NVPTXISD::Suld1DArrayV4I8Trap:
400 case NVPTXISD::Suld1DArrayV4I16Trap:
401 case NVPTXISD::Suld1DArrayV4I32Trap:
402 case NVPTXISD::Suld2DI8Trap:
403 case NVPTXISD::Suld2DI16Trap:
404 case NVPTXISD::Suld2DI32Trap:
405 case NVPTXISD::Suld2DI64Trap:
406 case NVPTXISD::Suld2DV2I8Trap:
407 case NVPTXISD::Suld2DV2I16Trap:
408 case NVPTXISD::Suld2DV2I32Trap:
409 case NVPTXISD::Suld2DV2I64Trap:
410 case NVPTXISD::Suld2DV4I8Trap:
411 case NVPTXISD::Suld2DV4I16Trap:
412 case NVPTXISD::Suld2DV4I32Trap:
413 case NVPTXISD::Suld2DArrayI8Trap:
414 case NVPTXISD::Suld2DArrayI16Trap:
415 case NVPTXISD::Suld2DArrayI32Trap:
416 case NVPTXISD::Suld2DArrayI64Trap:
417 case NVPTXISD::Suld2DArrayV2I8Trap:
418 case NVPTXISD::Suld2DArrayV2I16Trap:
419 case NVPTXISD::Suld2DArrayV2I32Trap:
420 case NVPTXISD::Suld2DArrayV2I64Trap:
421 case NVPTXISD::Suld2DArrayV4I8Trap:
422 case NVPTXISD::Suld2DArrayV4I16Trap:
423 case NVPTXISD::Suld2DArrayV4I32Trap:
424 case NVPTXISD::Suld3DI8Trap:
425 case NVPTXISD::Suld3DI16Trap:
426 case NVPTXISD::Suld3DI32Trap:
427 case NVPTXISD::Suld3DI64Trap:
428 case NVPTXISD::Suld3DV2I8Trap:
429 case NVPTXISD::Suld3DV2I16Trap:
430 case NVPTXISD::Suld3DV2I32Trap:
431 case NVPTXISD::Suld3DV2I64Trap:
432 case NVPTXISD::Suld3DV4I8Trap:
433 case NVPTXISD::Suld3DV4I16Trap:
434 case NVPTXISD::Suld3DV4I32Trap:
435 case NVPTXISD::Suld1DI8Zero:
436 case NVPTXISD::Suld1DI16Zero:
437 case NVPTXISD::Suld1DI32Zero:
438 case NVPTXISD::Suld1DI64Zero:
439 case NVPTXISD::Suld1DV2I8Zero:
440 case NVPTXISD::Suld1DV2I16Zero:
441 case NVPTXISD::Suld1DV2I32Zero:
442 case NVPTXISD::Suld1DV2I64Zero:
443 case NVPTXISD::Suld1DV4I8Zero:
444 case NVPTXISD::Suld1DV4I16Zero:
445 case NVPTXISD::Suld1DV4I32Zero:
446 case NVPTXISD::Suld1DArrayI8Zero:
447 case NVPTXISD::Suld1DArrayI16Zero:
448 case NVPTXISD::Suld1DArrayI32Zero:
449 case NVPTXISD::Suld1DArrayI64Zero:
450 case NVPTXISD::Suld1DArrayV2I8Zero:
451 case NVPTXISD::Suld1DArrayV2I16Zero:
452 case NVPTXISD::Suld1DArrayV2I32Zero:
453 case NVPTXISD::Suld1DArrayV2I64Zero:
454 case NVPTXISD::Suld1DArrayV4I8Zero:
455 case NVPTXISD::Suld1DArrayV4I16Zero:
456 case NVPTXISD::Suld1DArrayV4I32Zero:
457 case NVPTXISD::Suld2DI8Zero:
458 case NVPTXISD::Suld2DI16Zero:
459 case NVPTXISD::Suld2DI32Zero:
460 case NVPTXISD::Suld2DI64Zero:
461 case NVPTXISD::Suld2DV2I8Zero:
462 case NVPTXISD::Suld2DV2I16Zero:
463 case NVPTXISD::Suld2DV2I32Zero:
464 case NVPTXISD::Suld2DV2I64Zero:
465 case NVPTXISD::Suld2DV4I8Zero:
466 case NVPTXISD::Suld2DV4I16Zero:
467 case NVPTXISD::Suld2DV4I32Zero:
468 case NVPTXISD::Suld2DArrayI8Zero:
469 case NVPTXISD::Suld2DArrayI16Zero:
470 case NVPTXISD::Suld2DArrayI32Zero:
471 case NVPTXISD::Suld2DArrayI64Zero:
472 case NVPTXISD::Suld2DArrayV2I8Zero:
473 case NVPTXISD::Suld2DArrayV2I16Zero:
474 case NVPTXISD::Suld2DArrayV2I32Zero:
475 case NVPTXISD::Suld2DArrayV2I64Zero:
476 case NVPTXISD::Suld2DArrayV4I8Zero:
477 case NVPTXISD::Suld2DArrayV4I16Zero:
478 case NVPTXISD::Suld2DArrayV4I32Zero:
479 case NVPTXISD::Suld3DI8Zero:
480 case NVPTXISD::Suld3DI16Zero:
481 case NVPTXISD::Suld3DI32Zero:
482 case NVPTXISD::Suld3DI64Zero:
483 case NVPTXISD::Suld3DV2I8Zero:
484 case NVPTXISD::Suld3DV2I16Zero:
485 case NVPTXISD::Suld3DV2I32Zero:
486 case NVPTXISD::Suld3DV2I64Zero:
487 case NVPTXISD::Suld3DV4I8Zero:
488 case NVPTXISD::Suld3DV4I16Zero:
489 case NVPTXISD::Suld3DV4I32Zero:
490 if (trySurfaceIntrinsic(N))
491 return;
492 break;
493 case ISD::AND:
494 case ISD::SRA:
495 case ISD::SRL:
496 // Try to select BFE
497 if (tryBFE(N))
498 return;
499 break;
500 case ISD::ADDRSPACECAST:
501 SelectAddrSpaceCast(N);
502 return;
503 case ISD::ConstantFP:
504 if (tryConstantFP(N))
505 return;
506 break;
507 default:
508 break;
510 SelectCode(N);
513 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
514 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
515 switch (IID) {
516 default:
517 return false;
518 case Intrinsic::nvvm_ldg_global_f:
519 case Intrinsic::nvvm_ldg_global_i:
520 case Intrinsic::nvvm_ldg_global_p:
521 case Intrinsic::nvvm_ldu_global_f:
522 case Intrinsic::nvvm_ldu_global_i:
523 case Intrinsic::nvvm_ldu_global_p:
524 return tryLDGLDU(N);
528 // There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
529 // have to load them into an .(b)f16 register first.
530 bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
531 if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
532 return false;
533 SDValue Val = CurDAG->getTargetConstantFP(
534 cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
535 SDNode *LoadConstF16 = CurDAG->getMachineNode(
536 (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
537 : NVPTX::LOAD_CONST_BF16),
538 SDLoc(N), N->getValueType(0), Val);
539 ReplaceNode(N, LoadConstF16);
540 return true;
543 // Map ISD:CONDCODE value to appropriate CmpMode expected by
544 // NVPTXInstPrinter::printCmpMode()
545 static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
546 using NVPTX::PTXCmpMode::CmpMode;
547 unsigned PTXCmpMode = [](ISD::CondCode CC) {
548 switch (CC) {
549 default:
550 llvm_unreachable("Unexpected condition code.");
551 case ISD::SETOEQ:
552 return CmpMode::EQ;
553 case ISD::SETOGT:
554 return CmpMode::GT;
555 case ISD::SETOGE:
556 return CmpMode::GE;
557 case ISD::SETOLT:
558 return CmpMode::LT;
559 case ISD::SETOLE:
560 return CmpMode::LE;
561 case ISD::SETONE:
562 return CmpMode::NE;
563 case ISD::SETO:
564 return CmpMode::NUM;
565 case ISD::SETUO:
566 return CmpMode::NotANumber;
567 case ISD::SETUEQ:
568 return CmpMode::EQU;
569 case ISD::SETUGT:
570 return CmpMode::GTU;
571 case ISD::SETUGE:
572 return CmpMode::GEU;
573 case ISD::SETULT:
574 return CmpMode::LTU;
575 case ISD::SETULE:
576 return CmpMode::LEU;
577 case ISD::SETUNE:
578 return CmpMode::NEU;
579 case ISD::SETEQ:
580 return CmpMode::EQ;
581 case ISD::SETGT:
582 return CmpMode::GT;
583 case ISD::SETGE:
584 return CmpMode::GE;
585 case ISD::SETLT:
586 return CmpMode::LT;
587 case ISD::SETLE:
588 return CmpMode::LE;
589 case ISD::SETNE:
590 return CmpMode::NE;
592 }(CondCode.get());
594 if (FTZ)
595 PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
597 return PTXCmpMode;
600 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
601 unsigned PTXCmpMode =
602 getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
603 SDLoc DL(N);
604 SDNode *SetP = CurDAG->getMachineNode(
605 NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
606 N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
607 ReplaceNode(N, SetP);
608 return true;
611 // Find all instances of extract_vector_elt that use this v2f16 vector
612 // and coalesce them into a scattering move instruction.
613 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
614 SDValue Vector = N->getOperand(0);
616 // We only care about 16x2 as it's the only real vector type we
617 // need to deal with.
618 MVT VT = Vector.getSimpleValueType();
619 if (!Isv2x16VT(VT))
620 return false;
621 // Find and record all uses of this vector that extract element 0 or 1.
622 SmallVector<SDNode *, 4> E0, E1;
623 for (auto *U : Vector.getNode()->uses()) {
624 if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
625 continue;
626 if (U->getOperand(0) != Vector)
627 continue;
628 if (const ConstantSDNode *IdxConst =
629 dyn_cast<ConstantSDNode>(U->getOperand(1))) {
630 if (IdxConst->getZExtValue() == 0)
631 E0.push_back(U);
632 else if (IdxConst->getZExtValue() == 1)
633 E1.push_back(U);
634 else
635 llvm_unreachable("Invalid vector index.");
639 // There's no point scattering f16x2 if we only ever access one
640 // element of it.
641 if (E0.empty() || E1.empty())
642 return false;
644 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
645 // into f16,f16 SplitF16x2(V)
646 MVT EltVT = VT.getVectorElementType();
647 SDNode *ScatterOp =
648 CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
649 for (auto *Node : E0)
650 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
651 for (auto *Node : E1)
652 ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
654 return true;
657 static unsigned int getCodeAddrSpace(MemSDNode *N) {
658 const Value *Src = N->getMemOperand()->getValue();
660 if (!Src)
661 return NVPTX::PTXLdStInstCode::GENERIC;
663 if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
664 switch (PT->getAddressSpace()) {
665 case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
666 case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
667 case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
668 case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
669 case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
670 case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
671 default: break;
674 return NVPTX::PTXLdStInstCode::GENERIC;
677 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
678 unsigned CodeAddrSpace, MachineFunction *F) {
679 // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
680 // space.
682 // We have two ways of identifying invariant loads: Loads may be explicitly
683 // marked as invariant, or we may infer them to be invariant.
685 // We currently infer invariance for loads from
686 // - constant global variables, and
687 // - kernel function pointer params that are noalias (i.e. __restrict) and
688 // never written to.
690 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
691 // not during the SelectionDAG phase).
693 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
694 // explicitly invariant loads because these are how clang tells us to use ldg
695 // when the user uses a builtin.
696 if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
697 return false;
699 if (N->isInvariant())
700 return true;
702 bool IsKernelFn = isKernelFunction(F->getFunction());
704 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
705 // because the former looks through phi nodes while the latter does not. We
706 // need to look through phi nodes to handle pointer induction variables.
707 SmallVector<const Value *, 8> Objs;
708 getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
710 return all_of(Objs, [&](const Value *V) {
711 if (auto *A = dyn_cast<const Argument>(V))
712 return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
713 if (auto *GV = dyn_cast<const GlobalVariable>(V))
714 return GV->isConstant();
715 return false;
719 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
720 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
721 switch (IID) {
722 default:
723 return false;
724 case Intrinsic::nvvm_texsurf_handle_internal:
725 SelectTexSurfHandle(N);
726 return true;
730 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
731 // Op 0 is the intrinsic ID
732 SDValue Wrapper = N->getOperand(1);
733 SDValue GlobalVal = Wrapper.getOperand(0);
734 ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
735 MVT::i64, GlobalVal));
738 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
739 SDValue Src = N->getOperand(0);
740 AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
741 unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
742 unsigned DstAddrSpace = CastN->getDestAddressSpace();
743 assert(SrcAddrSpace != DstAddrSpace &&
744 "addrspacecast must be between different address spaces");
746 if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
747 // Specific to generic
748 unsigned Opc;
749 switch (SrcAddrSpace) {
750 default: report_fatal_error("Bad address space in addrspacecast");
751 case ADDRESS_SPACE_GLOBAL:
752 Opc = TM.is64Bit() ? NVPTX::cvta_global_yes_64 : NVPTX::cvta_global_yes;
753 break;
754 case ADDRESS_SPACE_SHARED:
755 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
756 : NVPTX::cvta_shared_yes_64)
757 : NVPTX::cvta_shared_yes;
758 break;
759 case ADDRESS_SPACE_CONST:
760 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
761 : NVPTX::cvta_const_yes_64)
762 : NVPTX::cvta_const_yes;
763 break;
764 case ADDRESS_SPACE_LOCAL:
765 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
766 : NVPTX::cvta_local_yes_64)
767 : NVPTX::cvta_local_yes;
768 break;
770 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
771 Src));
772 return;
773 } else {
774 // Generic to specific
775 if (SrcAddrSpace != 0)
776 report_fatal_error("Cannot cast between two non-generic address spaces");
777 unsigned Opc;
778 switch (DstAddrSpace) {
779 default: report_fatal_error("Bad address space in addrspacecast");
780 case ADDRESS_SPACE_GLOBAL:
781 Opc = TM.is64Bit() ? NVPTX::cvta_to_global_yes_64
782 : NVPTX::cvta_to_global_yes;
783 break;
784 case ADDRESS_SPACE_SHARED:
785 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
786 : NVPTX::cvta_to_shared_yes_64)
787 : NVPTX::cvta_to_shared_yes;
788 break;
789 case ADDRESS_SPACE_CONST:
790 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
791 : NVPTX::cvta_to_const_yes_64)
792 : NVPTX::cvta_to_const_yes;
793 break;
794 case ADDRESS_SPACE_LOCAL:
795 Opc = TM.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
796 : NVPTX::cvta_to_local_yes_64)
797 : NVPTX::cvta_to_local_yes;
798 break;
799 case ADDRESS_SPACE_PARAM:
800 Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
801 : NVPTX::nvvm_ptr_gen_to_param;
802 break;
804 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
805 Src));
806 return;
810 // Helper function template to reduce amount of boilerplate code for
811 // opcode selection.
812 static std::optional<unsigned>
813 pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
814 unsigned Opcode_i16, unsigned Opcode_i32,
815 std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
816 std::optional<unsigned> Opcode_f64) {
817 switch (VT) {
818 case MVT::i1:
819 case MVT::i8:
820 return Opcode_i8;
821 case MVT::i16:
822 return Opcode_i16;
823 case MVT::i32:
824 return Opcode_i32;
825 case MVT::i64:
826 return Opcode_i64;
827 case MVT::f16:
828 case MVT::bf16:
829 return Opcode_i16;
830 case MVT::v2f16:
831 case MVT::v2bf16:
832 case MVT::v2i16:
833 case MVT::v4i8:
834 return Opcode_i32;
835 case MVT::f32:
836 return Opcode_f32;
837 case MVT::f64:
838 return Opcode_f64;
839 default:
840 return std::nullopt;
844 static int getLdStRegType(EVT VT) {
845 if (VT.isFloatingPoint())
846 switch (VT.getSimpleVT().SimpleTy) {
847 case MVT::f16:
848 case MVT::bf16:
849 case MVT::v2f16:
850 case MVT::v2bf16:
851 return NVPTX::PTXLdStInstCode::Untyped;
852 default:
853 return NVPTX::PTXLdStInstCode::Float;
855 else
856 return NVPTX::PTXLdStInstCode::Unsigned;
859 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
860 SDLoc dl(N);
861 MemSDNode *LD = cast<MemSDNode>(N);
862 assert(LD->readMem() && "Expected load");
863 LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
864 EVT LoadedVT = LD->getMemoryVT();
865 SDNode *NVPTXLD = nullptr;
867 // do not support pre/post inc/dec
868 if (PlainLoad && PlainLoad->isIndexed())
869 return false;
871 if (!LoadedVT.isSimple())
872 return false;
874 AtomicOrdering Ordering = LD->getSuccessOrdering();
875 // In order to lower atomic loads with stronger guarantees we would need to
876 // use load.acquire or insert fences. However these features were only added
877 // with PTX ISA 6.0 / sm_70.
878 // TODO: Check if we can actually use the new instructions and implement them.
879 if (isStrongerThanMonotonic(Ordering))
880 return false;
882 // Address Space Setting
883 unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
884 if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
885 return tryLDGLDU(N);
888 unsigned int PointerSize =
889 CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
891 // Volatile Setting
892 // - .volatile is only available for .global and .shared
893 // - .volatile has the same memory synchronization semantics as .relaxed.sys
894 bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
895 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
896 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
897 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
898 isVolatile = false;
900 // Type Setting: fromType + fromTypeWidth
902 // Sign : ISD::SEXTLOAD
903 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
904 // type is integer
905 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
906 MVT SimpleVT = LoadedVT.getSimpleVT();
907 MVT ScalarVT = SimpleVT.getScalarType();
908 // Read at least 8 bits (predicates are stored as 8-bit values)
909 unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
910 unsigned int fromType;
912 // Vector Setting
913 unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
914 if (SimpleVT.isVector()) {
915 assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
916 "Unexpected vector type");
917 // v2f16/v2bf16/v2i16 is loaded using ld.b32
918 fromTypeWidth = 32;
921 if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
922 fromType = NVPTX::PTXLdStInstCode::Signed;
923 else
924 fromType = getLdStRegType(ScalarVT);
926 // Create the machine instruction DAG
927 SDValue Chain = N->getOperand(0);
928 SDValue N1 = N->getOperand(1);
929 SDValue Addr;
930 SDValue Offset, Base;
931 std::optional<unsigned> Opcode;
932 MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
934 if (SelectDirectAddr(N1, Addr)) {
935 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
936 NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
937 NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
938 if (!Opcode)
939 return false;
940 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
941 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
942 getI32Imm(fromTypeWidth, dl), Addr, Chain };
943 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
944 } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
945 : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
946 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
947 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
948 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
949 if (!Opcode)
950 return false;
951 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
952 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
953 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
954 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
955 } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
956 : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
957 if (PointerSize == 64)
958 Opcode =
959 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
960 NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
961 NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
962 else
963 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
964 NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
965 NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
966 if (!Opcode)
967 return false;
968 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
969 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
970 getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
971 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
972 } else {
973 if (PointerSize == 64)
974 Opcode =
975 pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
976 NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
977 NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
978 else
979 Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
980 NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
981 NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
982 if (!Opcode)
983 return false;
984 SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
985 getI32Imm(vecType, dl), getI32Imm(fromType, dl),
986 getI32Imm(fromTypeWidth, dl), N1, Chain };
987 NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
990 if (!NVPTXLD)
991 return false;
993 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
994 CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});
996 ReplaceNode(N, NVPTXLD);
997 return true;
// Try to select an NVPTX vector load (NVPTXISD::LoadV2/LoadV4) into one of
// the LDV_* machine instructions. Chooses the addressing mode (direct address,
// symbol+immediate, register+immediate, or plain register) and encodes
// volatility, address space, vector arity, element type and width as
// immediate operands of the machine node. Returns false to let the default
// tablegen-based selection handle the node instead.
bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  // Loads proven to read read-only global memory can use the non-coherent
  // cache path (ld.global.nc) instead; hand those off to tryLDGLDU.
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Sign   : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
  // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have ld.v8.16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    FromType = NVPTX::PTXLdStInstCode::Untyped;
    FromTypeWidth = 32;
  }

  // Addressing-mode dispatch: avar (direct address), asi (symbol+imm),
  // ari (register+imm), areg (plain register), tried in that order.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      // No 64-bit-element v4 forms exist in PTX (would exceed 128 bits).
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // Register+immediate: the register operand width depends on the pointer
    // size of the load's address space, hence the _64 instruction variants.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    // Fallback: address is an arbitrary register value.
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  // Carry over the original memory operand so later passes keep the load's
  // aliasing/ordering information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
// Select a load through the read-only / non-coherent global-memory paths:
// LDG (ld.global.nc) or LDU (ldu.global). N is either an
// @llvm.nvvm.ldg/ldu.* intrinsic node or a plain load / vector-load node
// that canLowerToLDG already proved safe to route here. Returns false to
// fall back to ordinary selection.
bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  // OrigType is the type the rest of the DAG expects; it may be wider than
  // the memory type for extending loads (handled by the CVT fixup below).
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  // Addressing-mode dispatch: avar (direct address), ari (register+imm),
  // areg (plain register); each mode picks the opcode by node kind
  // (scalar/V2/V4, LDG vs LDU) and element type.
  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    // Register+immediate addressing; LDG/LDU always address global memory, so
    // the register width follows the module pointer size (TM.is64Bit()).
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    // Fallback: address is an arbitrary register value.
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make sure
    // all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}
// Try to select a scalar store (plain StoreSDNode or monotonic AtomicSDNode
// store) into an ST_* machine instruction, choosing the addressing mode
// (direct address, symbol+imm, reg+imm, or plain register) and encoding
// volatility, address space, vector arity and value type as immediate
// operands. Returns false to fall back to default instruction selection.
bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  // Addressing-mode dispatch: avar (direct address), asi (symbol+imm),
  // ari (register+imm), areg (plain register), tried in that order.
  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    // Register+immediate: the register operand width depends on the pointer
    // size of the store's address space, hence the _64 instruction variants.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    // Fallback: address is an arbitrary register value.
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  // Carry over the original memory operand so later passes keep the store's
  // aliasing/ordering information.
  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
1796 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
1797 SDValue Chain = N->getOperand(0);
1798 SDValue Op1 = N->getOperand(1);
1799 SDValue Addr, Offset, Base;
1800 std::optional<unsigned> Opcode;
1801 SDLoc DL(N);
1802 SDNode *ST;
1803 EVT EltVT = Op1.getValueType();
1804 MemSDNode *MemSD = cast<MemSDNode>(N);
1805 EVT StoreVT = MemSD->getMemoryVT();
1807 // Address Space Setting
1808 unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
1809 if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
1810 report_fatal_error("Cannot store to pointer that points to constant "
1811 "memory space");
1813 unsigned int PointerSize =
1814 CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
1816 // Volatile Setting
1817 // - .volatile is only availalble for .global and .shared
1818 bool IsVolatile = MemSD->isVolatile();
1819 if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
1820 CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
1821 CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
1822 IsVolatile = false;
1824 // Type Setting: toType + toTypeWidth
1825 // - for integer type, always use 'u'
1826 assert(StoreVT.isSimple() && "Store value is not simple");
1827 MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
1828 unsigned ToTypeWidth = ScalarVT.getSizeInBits();
1829 unsigned ToType = getLdStRegType(ScalarVT);
1831 SmallVector<SDValue, 12> StOps;
1832 SDValue N2;
1833 unsigned VecType;
1835 switch (N->getOpcode()) {
1836 case NVPTXISD::StoreV2:
1837 VecType = NVPTX::PTXLdStInstCode::V2;
1838 StOps.push_back(N->getOperand(1));
1839 StOps.push_back(N->getOperand(2));
1840 N2 = N->getOperand(3);
1841 break;
1842 case NVPTXISD::StoreV4:
1843 VecType = NVPTX::PTXLdStInstCode::V4;
1844 StOps.push_back(N->getOperand(1));
1845 StOps.push_back(N->getOperand(2));
1846 StOps.push_back(N->getOperand(3));
1847 StOps.push_back(N->getOperand(4));
1848 N2 = N->getOperand(5);
1849 break;
1850 default:
1851 return false;
1854 // v8x16 is a special case. PTX doesn't have st.v8.x16
1855 // instruction. Instead, we split the vector into v2x16 chunks and
1856 // store them with st.v4.b32.
1857 if (Isv2x16VT(EltVT)) {
1858 assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
1859 EltVT = MVT::i32;
1860 ToType = NVPTX::PTXLdStInstCode::Untyped;
1861 ToTypeWidth = 32;
1864 StOps.push_back(getI32Imm(IsVolatile, DL));
1865 StOps.push_back(getI32Imm(CodeAddrSpace, DL));
1866 StOps.push_back(getI32Imm(VecType, DL));
1867 StOps.push_back(getI32Imm(ToType, DL));
1868 StOps.push_back(getI32Imm(ToTypeWidth, DL));
1870 if (SelectDirectAddr(N2, Addr)) {
1871 switch (N->getOpcode()) {
1872 default:
1873 return false;
1874 case NVPTXISD::StoreV2:
1875 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1876 NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
1877 NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
1878 NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
1879 break;
1880 case NVPTXISD::StoreV4:
1881 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1882 NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
1883 NVPTX::STV_i32_v4_avar, std::nullopt,
1884 NVPTX::STV_f32_v4_avar, std::nullopt);
1885 break;
1887 StOps.push_back(Addr);
1888 } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
1889 : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
1890 switch (N->getOpcode()) {
1891 default:
1892 return false;
1893 case NVPTXISD::StoreV2:
1894 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1895 NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
1896 NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
1897 NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
1898 break;
1899 case NVPTXISD::StoreV4:
1900 Opcode =
1901 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
1902 NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
1903 std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
1904 break;
1906 StOps.push_back(Base);
1907 StOps.push_back(Offset);
1908 } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
1909 : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
1910 if (PointerSize == 64) {
1911 switch (N->getOpcode()) {
1912 default:
1913 return false;
1914 case NVPTXISD::StoreV2:
1915 Opcode =
1916 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1917 NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
1918 NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
1919 NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
1920 break;
1921 case NVPTXISD::StoreV4:
1922 Opcode = pickOpcodeForVT(
1923 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
1924 NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
1925 NVPTX::STV_f32_v4_ari_64, std::nullopt);
1926 break;
1928 } else {
1929 switch (N->getOpcode()) {
1930 default:
1931 return false;
1932 case NVPTXISD::StoreV2:
1933 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1934 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
1935 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
1936 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
1937 break;
1938 case NVPTXISD::StoreV4:
1939 Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1940 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
1941 NVPTX::STV_i32_v4_ari, std::nullopt,
1942 NVPTX::STV_f32_v4_ari, std::nullopt);
1943 break;
1946 StOps.push_back(Base);
1947 StOps.push_back(Offset);
1948 } else {
1949 if (PointerSize == 64) {
1950 switch (N->getOpcode()) {
1951 default:
1952 return false;
1953 case NVPTXISD::StoreV2:
1954 Opcode = pickOpcodeForVT(
1955 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
1956 NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
1957 NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
1958 NVPTX::STV_f64_v2_areg_64);
1959 break;
1960 case NVPTXISD::StoreV4:
1961 Opcode = pickOpcodeForVT(
1962 EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
1963 NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
1964 NVPTX::STV_f32_v4_areg_64, std::nullopt);
1965 break;
1967 } else {
1968 switch (N->getOpcode()) {
1969 default:
1970 return false;
1971 case NVPTXISD::StoreV2:
1972 Opcode =
1973 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
1974 NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
1975 NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
1976 NVPTX::STV_f64_v2_areg);
1977 break;
1978 case NVPTXISD::StoreV4:
1979 Opcode =
1980 pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
1981 NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
1982 std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
1983 break;
1986 StOps.push_back(N2);
1989 if (!Opcode)
1990 return false;
1992 StOps.push_back(Chain);
1994 ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);
1996 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
1997 CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});
1999 ReplaceNode(N, ST);
2000 return true;
2003 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
2004 SDValue Chain = Node->getOperand(0);
2005 SDValue Offset = Node->getOperand(2);
2006 SDValue Glue = Node->getOperand(3);
2007 SDLoc DL(Node);
2008 MemSDNode *Mem = cast<MemSDNode>(Node);
2010 unsigned VecSize;
2011 switch (Node->getOpcode()) {
2012 default:
2013 return false;
2014 case NVPTXISD::LoadParam:
2015 VecSize = 1;
2016 break;
2017 case NVPTXISD::LoadParamV2:
2018 VecSize = 2;
2019 break;
2020 case NVPTXISD::LoadParamV4:
2021 VecSize = 4;
2022 break;
2025 EVT EltVT = Node->getValueType(0);
2026 EVT MemVT = Mem->getMemoryVT();
2028 std::optional<unsigned> Opcode;
2030 switch (VecSize) {
2031 default:
2032 return false;
2033 case 1:
2034 Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
2035 NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
2036 NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
2037 NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
2038 break;
2039 case 2:
2040 Opcode =
2041 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
2042 NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
2043 NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
2044 NVPTX::LoadParamMemV2F64);
2045 break;
2046 case 4:
2047 Opcode =
2048 pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
2049 NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
2050 std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
2051 break;
2053 if (!Opcode)
2054 return false;
2056 SDVTList VTs;
2057 if (VecSize == 1) {
2058 VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
2059 } else if (VecSize == 2) {
2060 VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
2061 } else {
2062 EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
2063 VTs = CurDAG->getVTList(EVTs);
2066 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2068 SmallVector<SDValue, 2> Ops;
2069 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2070 Ops.push_back(Chain);
2071 Ops.push_back(Glue);
2073 ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
2074 return true;
2077 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2078 SDLoc DL(N);
2079 SDValue Chain = N->getOperand(0);
2080 SDValue Offset = N->getOperand(1);
2081 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2082 MemSDNode *Mem = cast<MemSDNode>(N);
2084 // How many elements do we have?
2085 unsigned NumElts = 1;
2086 switch (N->getOpcode()) {
2087 default:
2088 return false;
2089 case NVPTXISD::StoreRetval:
2090 NumElts = 1;
2091 break;
2092 case NVPTXISD::StoreRetvalV2:
2093 NumElts = 2;
2094 break;
2095 case NVPTXISD::StoreRetvalV4:
2096 NumElts = 4;
2097 break;
2100 // Build vector of operands
2101 SmallVector<SDValue, 6> Ops;
2102 for (unsigned i = 0; i < NumElts; ++i)
2103 Ops.push_back(N->getOperand(i + 2));
2104 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2105 Ops.push_back(Chain);
2107 // Determine target opcode
2108 // If we have an i1, use an 8-bit store. The lowering code in
2109 // NVPTXISelLowering will have already emitted an upcast.
2110 std::optional<unsigned> Opcode = 0;
2111 switch (NumElts) {
2112 default:
2113 return false;
2114 case 1:
2115 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2116 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2117 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2118 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2119 break;
2120 case 2:
2121 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2122 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2123 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2124 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2125 break;
2126 case 4:
2127 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2128 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2129 NVPTX::StoreRetvalV4I32, std::nullopt,
2130 NVPTX::StoreRetvalV4F32, std::nullopt);
2131 break;
2133 if (!Opcode)
2134 return false;
2136 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2137 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2138 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2140 ReplaceNode(N, Ret);
2141 return true;
2144 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2145 SDLoc DL(N);
2146 SDValue Chain = N->getOperand(0);
2147 SDValue Param = N->getOperand(1);
2148 unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
2149 SDValue Offset = N->getOperand(2);
2150 unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
2151 MemSDNode *Mem = cast<MemSDNode>(N);
2152 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2154 // How many elements do we have?
2155 unsigned NumElts = 1;
2156 switch (N->getOpcode()) {
2157 default:
2158 return false;
2159 case NVPTXISD::StoreParamU32:
2160 case NVPTXISD::StoreParamS32:
2161 case NVPTXISD::StoreParam:
2162 NumElts = 1;
2163 break;
2164 case NVPTXISD::StoreParamV2:
2165 NumElts = 2;
2166 break;
2167 case NVPTXISD::StoreParamV4:
2168 NumElts = 4;
2169 break;
2172 // Build vector of operands
2173 SmallVector<SDValue, 8> Ops;
2174 for (unsigned i = 0; i < NumElts; ++i)
2175 Ops.push_back(N->getOperand(i + 3));
2176 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2177 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2178 Ops.push_back(Chain);
2179 Ops.push_back(Glue);
2181 // Determine target opcode
2182 // If we have an i1, use an 8-bit store. The lowering code in
2183 // NVPTXISelLowering will have already emitted an upcast.
2184 std::optional<unsigned> Opcode = 0;
2185 switch (N->getOpcode()) {
2186 default:
2187 switch (NumElts) {
2188 default:
2189 return false;
2190 case 1:
2191 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2192 NVPTX::StoreParamI8, NVPTX::StoreParamI16,
2193 NVPTX::StoreParamI32, NVPTX::StoreParamI64,
2194 NVPTX::StoreParamF32, NVPTX::StoreParamF64);
2195 break;
2196 case 2:
2197 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2198 NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
2199 NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
2200 NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
2201 break;
2202 case 4:
2203 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2204 NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
2205 NVPTX::StoreParamV4I32, std::nullopt,
2206 NVPTX::StoreParamV4F32, std::nullopt);
2207 break;
2209 if (!Opcode)
2210 return false;
2211 break;
2212 // Special case: if we have a sign-extend/zero-extend node, insert the
2213 // conversion instruction first, and use that as the value operand to
2214 // the selected StoreParam node.
2215 case NVPTXISD::StoreParamU32: {
2216 Opcode = NVPTX::StoreParamI32;
2217 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2218 MVT::i32);
2219 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2220 MVT::i32, Ops[0], CvtNone);
2221 Ops[0] = SDValue(Cvt, 0);
2222 break;
2224 case NVPTXISD::StoreParamS32: {
2225 Opcode = NVPTX::StoreParamI32;
2226 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2227 MVT::i32);
2228 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2229 MVT::i32, Ops[0], CvtNone);
2230 Ops[0] = SDValue(Cvt, 0);
2231 break;
2235 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2236 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2237 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2238 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2240 ReplaceNode(N, Ret);
2241 return true;
// Select a texture-fetch ISD node into the matching TEX_*/TLD4_* machine
// instruction. This is a pure 1:1 opcode mapping keyed on geometry
// (1D/2D/3D/cube, optionally array), result type (F32/S32/U32), coordinate
// type, and sampling mode (plain / LEVEL = explicit LOD / GRAD = explicit
// gradients). Operands are copied through unchanged except that the chain
// is moved to the back.
//
// NOTE(review): the _RR vs _R suffixes appear to distinguish the
// non-unified (separate texref + sampler register operands) from the
// unified-texture-mode variants — confirm against NVPTXIntrinsics.td.
//
// Returns true and replaces \p N on success, false for non-texture nodes.
bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
  unsigned Opc = 0;

  switch (N->getOpcode()) {
  default: return false;
  // --- Non-unified texture fetches (texref + sampler) ---
  case NVPTXISD::Tex1DFloatS32:
    Opc = NVPTX::TEX_1D_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloat:
    Opc = NVPTX::TEX_1D_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatLevel:
    Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DFloatFloatGrad:
    Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DS32S32:
    Opc = NVPTX::TEX_1D_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DS32Float:
    Opc = NVPTX::TEX_1D_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DS32FloatLevel:
    Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DS32FloatGrad:
    Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DU32S32:
    Opc = NVPTX::TEX_1D_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DU32Float:
    Opc = NVPTX::TEX_1D_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DU32FloatLevel:
    Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DU32FloatGrad:
    Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatS32:
    Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloat:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayS32S32:
    Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32Float:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex1DArrayU32S32:
    Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32Float:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DFloatS32:
    Opc = NVPTX::TEX_2D_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloat:
    Opc = NVPTX::TEX_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatLevel:
    Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DFloatFloatGrad:
    Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DS32S32:
    Opc = NVPTX::TEX_2D_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DS32Float:
    Opc = NVPTX::TEX_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DS32FloatLevel:
    Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DS32FloatGrad:
    Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DU32S32:
    Opc = NVPTX::TEX_2D_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DU32Float:
    Opc = NVPTX::TEX_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DU32FloatLevel:
    Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DU32FloatGrad:
    Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatS32:
    Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloat:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayS32S32:
    Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32Float:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex2DArrayU32S32:
    Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32Float:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DFloatS32:
    Opc = NVPTX::TEX_3D_F32_S32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloat:
    Opc = NVPTX::TEX_3D_F32_F32_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatLevel:
    Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DFloatFloatGrad:
    Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DS32S32:
    Opc = NVPTX::TEX_3D_S32_S32_RR;
    break;
  case NVPTXISD::Tex3DS32Float:
    Opc = NVPTX::TEX_3D_S32_F32_RR;
    break;
  case NVPTXISD::Tex3DS32FloatLevel:
    Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DS32FloatGrad:
    Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
    break;
  case NVPTXISD::Tex3DU32S32:
    Opc = NVPTX::TEX_3D_U32_S32_RR;
    break;
  case NVPTXISD::Tex3DU32Float:
    Opc = NVPTX::TEX_3D_U32_F32_RR;
    break;
  case NVPTXISD::Tex3DU32FloatLevel:
    Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::Tex3DU32FloatGrad:
    Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
    break;
  case NVPTXISD::TexCubeFloatFloat:
    Opc = NVPTX::TEX_CUBE_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeS32Float:
    Opc = NVPTX::TEX_CUBE_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeU32Float:
    Opc = NVPTX::TEX_CUBE_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloat:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayS32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
    break;
  case NVPTXISD::TexCubeArrayU32Float:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
    break;
  case NVPTXISD::TexCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
    break;
  // --- tld4 (2x2 texel gather); R/G/B/A selects the gathered component ---
  case NVPTXISD::Tld4R2DFloatFloat:
    Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DFloatFloat:
    Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DFloatFloat:
    Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DFloatFloat:
    Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DS64Float:
    Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DS64Float:
    Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DS64Float:
    Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DS64Float:
    Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
    break;
  case NVPTXISD::Tld4R2DU64Float:
    Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4G2DU64Float:
    Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4B2DU64Float:
    Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
    break;
  case NVPTXISD::Tld4A2DU64Float:
    Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
    break;
  // --- Unified-mode texture fetches ---
  case NVPTXISD::TexUnified1DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified1DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatS32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32S32:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified2DArrayU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DFloatS32:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DFloatFloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DS32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
    break;
  case NVPTXISD::TexUnified3DS32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DS32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnified3DU32S32:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
    break;
  case NVPTXISD::TexUnified3DU32Float:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnified3DU32FloatGrad:
    Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloat:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32Float:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
    break;
  case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel:
    Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DFloatFloat:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DS64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedR2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedG2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedB2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
    break;
  case NVPTXISD::Tld4UnifiedA2DU64Float:
    Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
    break;
  }

  // Copy over operands
  SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
  Ops.push_back(N->getOperand(0)); // Move chain to the back.

  ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
  return true;
}
/// trySurfaceIntrinsic - Select an NVPTXISD surface-load (Suld*) node into the
/// matching SULD_* machine instruction and replace \p N with it. The switch is
/// a 1:1 mapping keyed on geometry (1D/1D_ARRAY/2D/2D_ARRAY/3D), vector width
/// (scalar/V2/V4) and element width (I8..I64), grouped by the out-of-bounds
/// mode suffix (Clamp/Trap/Zero). Returns false when \p N is not one of the
/// surface-load opcodes handled here.
2763 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2764 unsigned Opc = 0;
2765 switch (N->getOpcode()) {
2766 default: return false;
// "Clamp" out-of-bounds mode variants.
2767 case NVPTXISD::Suld1DI8Clamp:
2768 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2769 break;
2770 case NVPTXISD::Suld1DI16Clamp:
2771 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2772 break;
2773 case NVPTXISD::Suld1DI32Clamp:
2774 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2775 break;
2776 case NVPTXISD::Suld1DI64Clamp:
2777 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2778 break;
2779 case NVPTXISD::Suld1DV2I8Clamp:
2780 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2781 break;
2782 case NVPTXISD::Suld1DV2I16Clamp:
2783 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2784 break;
2785 case NVPTXISD::Suld1DV2I32Clamp:
2786 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2787 break;
2788 case NVPTXISD::Suld1DV2I64Clamp:
2789 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2790 break;
2791 case NVPTXISD::Suld1DV4I8Clamp:
2792 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2793 break;
2794 case NVPTXISD::Suld1DV4I16Clamp:
2795 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2796 break;
2797 case NVPTXISD::Suld1DV4I32Clamp:
2798 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2799 break;
2800 case NVPTXISD::Suld1DArrayI8Clamp:
2801 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2802 break;
2803 case NVPTXISD::Suld1DArrayI16Clamp:
2804 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2805 break;
2806 case NVPTXISD::Suld1DArrayI32Clamp:
2807 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2808 break;
2809 case NVPTXISD::Suld1DArrayI64Clamp:
2810 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2811 break;
2812 case NVPTXISD::Suld1DArrayV2I8Clamp:
2813 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
2814 break;
2815 case NVPTXISD::Suld1DArrayV2I16Clamp:
2816 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
2817 break;
2818 case NVPTXISD::Suld1DArrayV2I32Clamp:
2819 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
2820 break;
2821 case NVPTXISD::Suld1DArrayV2I64Clamp:
2822 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
2823 break;
2824 case NVPTXISD::Suld1DArrayV4I8Clamp:
2825 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
2826 break;
2827 case NVPTXISD::Suld1DArrayV4I16Clamp:
2828 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
2829 break;
2830 case NVPTXISD::Suld1DArrayV4I32Clamp:
2831 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
2832 break;
2833 case NVPTXISD::Suld2DI8Clamp:
2834 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
2835 break;
2836 case NVPTXISD::Suld2DI16Clamp:
2837 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
2838 break;
2839 case NVPTXISD::Suld2DI32Clamp:
2840 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
2841 break;
2842 case NVPTXISD::Suld2DI64Clamp:
2843 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
2844 break;
2845 case NVPTXISD::Suld2DV2I8Clamp:
2846 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
2847 break;
2848 case NVPTXISD::Suld2DV2I16Clamp:
2849 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
2850 break;
2851 case NVPTXISD::Suld2DV2I32Clamp:
2852 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
2853 break;
2854 case NVPTXISD::Suld2DV2I64Clamp:
2855 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
2856 break;
2857 case NVPTXISD::Suld2DV4I8Clamp:
2858 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
2859 break;
2860 case NVPTXISD::Suld2DV4I16Clamp:
2861 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
2862 break;
2863 case NVPTXISD::Suld2DV4I32Clamp:
2864 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
2865 break;
2866 case NVPTXISD::Suld2DArrayI8Clamp:
2867 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
2868 break;
2869 case NVPTXISD::Suld2DArrayI16Clamp:
2870 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
2871 break;
2872 case NVPTXISD::Suld2DArrayI32Clamp:
2873 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
2874 break;
2875 case NVPTXISD::Suld2DArrayI64Clamp:
2876 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
2877 break;
2878 case NVPTXISD::Suld2DArrayV2I8Clamp:
2879 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
2880 break;
2881 case NVPTXISD::Suld2DArrayV2I16Clamp:
2882 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
2883 break;
2884 case NVPTXISD::Suld2DArrayV2I32Clamp:
2885 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
2886 break;
2887 case NVPTXISD::Suld2DArrayV2I64Clamp:
2888 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
2889 break;
2890 case NVPTXISD::Suld2DArrayV4I8Clamp:
2891 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
2892 break;
2893 case NVPTXISD::Suld2DArrayV4I16Clamp:
2894 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
2895 break;
2896 case NVPTXISD::Suld2DArrayV4I32Clamp:
2897 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
2898 break;
2899 case NVPTXISD::Suld3DI8Clamp:
2900 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
2901 break;
2902 case NVPTXISD::Suld3DI16Clamp:
2903 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
2904 break;
2905 case NVPTXISD::Suld3DI32Clamp:
2906 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
2907 break;
2908 case NVPTXISD::Suld3DI64Clamp:
2909 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
2910 break;
2911 case NVPTXISD::Suld3DV2I8Clamp:
2912 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
2913 break;
2914 case NVPTXISD::Suld3DV2I16Clamp:
2915 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
2916 break;
2917 case NVPTXISD::Suld3DV2I32Clamp:
2918 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
2919 break;
2920 case NVPTXISD::Suld3DV2I64Clamp:
2921 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
2922 break;
2923 case NVPTXISD::Suld3DV4I8Clamp:
2924 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
2925 break;
2926 case NVPTXISD::Suld3DV4I16Clamp:
2927 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
2928 break;
2929 case NVPTXISD::Suld3DV4I32Clamp:
2930 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
2931 break;
// "Trap" out-of-bounds mode variants (same geometry/type grid as above).
2932 case NVPTXISD::Suld1DI8Trap:
2933 Opc = NVPTX::SULD_1D_I8_TRAP_R;
2934 break;
2935 case NVPTXISD::Suld1DI16Trap:
2936 Opc = NVPTX::SULD_1D_I16_TRAP_R;
2937 break;
2938 case NVPTXISD::Suld1DI32Trap:
2939 Opc = NVPTX::SULD_1D_I32_TRAP_R;
2940 break;
2941 case NVPTXISD::Suld1DI64Trap:
2942 Opc = NVPTX::SULD_1D_I64_TRAP_R;
2943 break;
2944 case NVPTXISD::Suld1DV2I8Trap:
2945 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
2946 break;
2947 case NVPTXISD::Suld1DV2I16Trap:
2948 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
2949 break;
2950 case NVPTXISD::Suld1DV2I32Trap:
2951 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
2952 break;
2953 case NVPTXISD::Suld1DV2I64Trap:
2954 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
2955 break;
2956 case NVPTXISD::Suld1DV4I8Trap:
2957 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
2958 break;
2959 case NVPTXISD::Suld1DV4I16Trap:
2960 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
2961 break;
2962 case NVPTXISD::Suld1DV4I32Trap:
2963 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
2964 break;
2965 case NVPTXISD::Suld1DArrayI8Trap:
2966 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
2967 break;
2968 case NVPTXISD::Suld1DArrayI16Trap:
2969 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
2970 break;
2971 case NVPTXISD::Suld1DArrayI32Trap:
2972 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
2973 break;
2974 case NVPTXISD::Suld1DArrayI64Trap:
2975 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
2976 break;
2977 case NVPTXISD::Suld1DArrayV2I8Trap:
2978 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
2979 break;
2980 case NVPTXISD::Suld1DArrayV2I16Trap:
2981 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
2982 break;
2983 case NVPTXISD::Suld1DArrayV2I32Trap:
2984 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
2985 break;
2986 case NVPTXISD::Suld1DArrayV2I64Trap:
2987 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
2988 break;
2989 case NVPTXISD::Suld1DArrayV4I8Trap:
2990 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
2991 break;
2992 case NVPTXISD::Suld1DArrayV4I16Trap:
2993 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
2994 break;
2995 case NVPTXISD::Suld1DArrayV4I32Trap:
2996 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
2997 break;
2998 case NVPTXISD::Suld2DI8Trap:
2999 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3000 break;
3001 case NVPTXISD::Suld2DI16Trap:
3002 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3003 break;
3004 case NVPTXISD::Suld2DI32Trap:
3005 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3006 break;
3007 case NVPTXISD::Suld2DI64Trap:
3008 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3009 break;
3010 case NVPTXISD::Suld2DV2I8Trap:
3011 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3012 break;
3013 case NVPTXISD::Suld2DV2I16Trap:
3014 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3015 break;
3016 case NVPTXISD::Suld2DV2I32Trap:
3017 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3018 break;
3019 case NVPTXISD::Suld2DV2I64Trap:
3020 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3021 break;
3022 case NVPTXISD::Suld2DV4I8Trap:
3023 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3024 break;
3025 case NVPTXISD::Suld2DV4I16Trap:
3026 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3027 break;
3028 case NVPTXISD::Suld2DV4I32Trap:
3029 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3030 break;
3031 case NVPTXISD::Suld2DArrayI8Trap:
3032 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3033 break;
3034 case NVPTXISD::Suld2DArrayI16Trap:
3035 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3036 break;
3037 case NVPTXISD::Suld2DArrayI32Trap:
3038 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3039 break;
3040 case NVPTXISD::Suld2DArrayI64Trap:
3041 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3042 break;
3043 case NVPTXISD::Suld2DArrayV2I8Trap:
3044 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3045 break;
3046 case NVPTXISD::Suld2DArrayV2I16Trap:
3047 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3048 break;
3049 case NVPTXISD::Suld2DArrayV2I32Trap:
3050 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3051 break;
3052 case NVPTXISD::Suld2DArrayV2I64Trap:
3053 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3054 break;
3055 case NVPTXISD::Suld2DArrayV4I8Trap:
3056 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3057 break;
3058 case NVPTXISD::Suld2DArrayV4I16Trap:
3059 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3060 break;
3061 case NVPTXISD::Suld2DArrayV4I32Trap:
3062 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3063 break;
3064 case NVPTXISD::Suld3DI8Trap:
3065 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3066 break;
3067 case NVPTXISD::Suld3DI16Trap:
3068 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3069 break;
3070 case NVPTXISD::Suld3DI32Trap:
3071 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3072 break;
3073 case NVPTXISD::Suld3DI64Trap:
3074 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3075 break;
3076 case NVPTXISD::Suld3DV2I8Trap:
3077 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3078 break;
3079 case NVPTXISD::Suld3DV2I16Trap:
3080 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3081 break;
3082 case NVPTXISD::Suld3DV2I32Trap:
3083 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3084 break;
3085 case NVPTXISD::Suld3DV2I64Trap:
3086 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3087 break;
3088 case NVPTXISD::Suld3DV4I8Trap:
3089 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3090 break;
3091 case NVPTXISD::Suld3DV4I16Trap:
3092 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3093 break;
3094 case NVPTXISD::Suld3DV4I32Trap:
3095 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3096 break;
// "Zero" out-of-bounds mode variants (same geometry/type grid as above).
3097 case NVPTXISD::Suld1DI8Zero:
3098 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3099 break;
3100 case NVPTXISD::Suld1DI16Zero:
3101 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3102 break;
3103 case NVPTXISD::Suld1DI32Zero:
3104 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3105 break;
3106 case NVPTXISD::Suld1DI64Zero:
3107 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3108 break;
3109 case NVPTXISD::Suld1DV2I8Zero:
3110 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3111 break;
3112 case NVPTXISD::Suld1DV2I16Zero:
3113 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3114 break;
3115 case NVPTXISD::Suld1DV2I32Zero:
3116 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3117 break;
3118 case NVPTXISD::Suld1DV2I64Zero:
3119 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3120 break;
3121 case NVPTXISD::Suld1DV4I8Zero:
3122 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3123 break;
3124 case NVPTXISD::Suld1DV4I16Zero:
3125 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3126 break;
3127 case NVPTXISD::Suld1DV4I32Zero:
3128 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3129 break;
3130 case NVPTXISD::Suld1DArrayI8Zero:
3131 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3132 break;
3133 case NVPTXISD::Suld1DArrayI16Zero:
3134 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3135 break;
3136 case NVPTXISD::Suld1DArrayI32Zero:
3137 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3138 break;
3139 case NVPTXISD::Suld1DArrayI64Zero:
3140 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3141 break;
3142 case NVPTXISD::Suld1DArrayV2I8Zero:
3143 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3144 break;
3145 case NVPTXISD::Suld1DArrayV2I16Zero:
3146 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3147 break;
3148 case NVPTXISD::Suld1DArrayV2I32Zero:
3149 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3150 break;
3151 case NVPTXISD::Suld1DArrayV2I64Zero:
3152 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3153 break;
3154 case NVPTXISD::Suld1DArrayV4I8Zero:
3155 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3156 break;
3157 case NVPTXISD::Suld1DArrayV4I16Zero:
3158 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3159 break;
3160 case NVPTXISD::Suld1DArrayV4I32Zero:
3161 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3162 break;
3163 case NVPTXISD::Suld2DI8Zero:
3164 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3165 break;
3166 case NVPTXISD::Suld2DI16Zero:
3167 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3168 break;
3169 case NVPTXISD::Suld2DI32Zero:
3170 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3171 break;
3172 case NVPTXISD::Suld2DI64Zero:
3173 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3174 break;
3175 case NVPTXISD::Suld2DV2I8Zero:
3176 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3177 break;
3178 case NVPTXISD::Suld2DV2I16Zero:
3179 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3180 break;
3181 case NVPTXISD::Suld2DV2I32Zero:
3182 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3183 break;
3184 case NVPTXISD::Suld2DV2I64Zero:
3185 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3186 break;
3187 case NVPTXISD::Suld2DV4I8Zero:
3188 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3189 break;
3190 case NVPTXISD::Suld2DV4I16Zero:
3191 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3192 break;
3193 case NVPTXISD::Suld2DV4I32Zero:
3194 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3195 break;
3196 case NVPTXISD::Suld2DArrayI8Zero:
3197 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3198 break;
3199 case NVPTXISD::Suld2DArrayI16Zero:
3200 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3201 break;
3202 case NVPTXISD::Suld2DArrayI32Zero:
3203 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3204 break;
3205 case NVPTXISD::Suld2DArrayI64Zero:
3206 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3207 break;
3208 case NVPTXISD::Suld2DArrayV2I8Zero:
3209 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3210 break;
3211 case NVPTXISD::Suld2DArrayV2I16Zero:
3212 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3213 break;
3214 case NVPTXISD::Suld2DArrayV2I32Zero:
3215 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3216 break;
3217 case NVPTXISD::Suld2DArrayV2I64Zero:
3218 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3219 break;
3220 case NVPTXISD::Suld2DArrayV4I8Zero:
3221 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3222 break;
3223 case NVPTXISD::Suld2DArrayV4I16Zero:
3224 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3225 break;
3226 case NVPTXISD::Suld2DArrayV4I32Zero:
3227 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3228 break;
3229 case NVPTXISD::Suld3DI8Zero:
3230 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3231 break;
3232 case NVPTXISD::Suld3DI16Zero:
3233 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3234 break;
3235 case NVPTXISD::Suld3DI32Zero:
3236 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3237 break;
3238 case NVPTXISD::Suld3DI64Zero:
3239 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3240 break;
3241 case NVPTXISD::Suld3DV2I8Zero:
3242 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3243 break;
3244 case NVPTXISD::Suld3DV2I16Zero:
3245 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3246 break;
3247 case NVPTXISD::Suld3DV2I32Zero:
3248 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3249 break;
3250 case NVPTXISD::Suld3DV2I64Zero:
3251 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3252 break;
3253 case NVPTXISD::Suld3DV4I8Zero:
3254 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3255 break;
3256 case NVPTXISD::Suld3DV4I16Zero:
3257 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3258 break;
3259 case NVPTXISD::Suld3DV4I32Zero:
3260 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3261 break;
// Copy over operands, dropping the intrinsic-id operand and moving the chain
// (operand 0) to the back, as the machine instruction expects.
3264 // Copy over operands
3265 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3266 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3268 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3269 return true;
3273 /// SelectBFE - Look for instruction sequences that can be made more efficient
3274 /// by using the 'bfe' (bit-field extract) PTX instruction
/// Matches the shapes (and x, mask), (srl/sra (and x, mask), amt) and
/// (srl/sra (shl x, nn), mm) rooted at \p N. On success, replaces \p N with a
/// BFE_* machine node taking (Val, Start, Len) operands and returns true;
/// otherwise returns false and leaves the DAG untouched.
3275 bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3276 SDLoc DL(N);
3277 SDValue LHS = N->getOperand(0);
3278 SDValue RHS = N->getOperand(1);
3279 SDValue Len;
3280 SDValue Start;
3281 SDValue Val;
3282 bool IsSigned = false;
3284 if (N->getOpcode() == ISD::AND) {
3285 // Canonicalize the operands
3286 // We want 'and %val, %mask'
3287 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3288 std::swap(LHS, RHS);
3291 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3292 if (!Mask) {
3293 // We need a constant mask on the RHS of the AND
3294 return false;
3297 // Extract the mask bits
3298 uint64_t MaskVal = Mask->getZExtValue();
3299 if (!isMask_64(MaskVal)) {
3300 // We *could* handle shifted masks here, but doing so would require an
3301 // 'and' operation to fix up the low-order bits so we would trade
3302 // shr+and for bfe+and, which has the same throughput
3303 return false;
3306 // How many bits are in our mask?
3307 int64_t NumBits = countr_one(MaskVal);
3308 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3310 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3311 // We have a 'srl/and' pair, extract the effective start bit and length
3312 Val = LHS.getNode()->getOperand(0);
3313 Start = LHS.getNode()->getOperand(1);
3314 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3315 if (StartConst) {
3316 uint64_t StartVal = StartConst->getZExtValue();
3317 // How many "good" bits do we have left? "good" is defined here as bits
3318 // that exist in the original value, not shifted in.
3319 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3320 if (NumBits > GoodBits) {
3321 // Do not handle the case where bits have been shifted in. In theory
3322 // we could handle this, but the cost is likely higher than just
3323 // emitting the srl/and pair.
3324 return false;
3326 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3327 } else {
3328 // Do not handle the case where the shift amount (can be zero if no srl
3329 // was found) is not constant. We could handle this case, but it would
3330 // require run-time logic that would be more expensive than just
3331 // emitting the srl/and pair.
3332 return false;
3334 } else {
3335 // Do not handle the case where the LHS of the and is not a shift. While
3336 // it would be trivial to handle this case, it would just transform
3337 // 'and' -> 'bfe', but 'and' has higher-throughput.
3338 return false;
3340 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3341 if (LHS->getOpcode() == ISD::AND) {
3342 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3343 if (!ShiftCnst) {
3344 // Shift amount must be constant
3345 return false;
3348 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3350 SDValue AndLHS = LHS->getOperand(0);
3351 SDValue AndRHS = LHS->getOperand(1);
3353 // Canonicalize the AND to have the mask on the RHS
3354 if (isa<ConstantSDNode>(AndLHS)) {
3355 std::swap(AndLHS, AndRHS);
3358 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3359 if (!MaskCnst) {
3360 // Mask must be constant
3361 return false;
3364 uint64_t MaskVal = MaskCnst->getZExtValue();
3365 uint64_t NumZeros;
3366 uint64_t NumBits;
3367 if (isMask_64(MaskVal)) {
3368 NumZeros = 0;
3369 // The number of bits in the result bitfield will be the number of
3370 // trailing ones (the AND) minus the number of bits we shift off
// NOTE(review): no guard that ShiftAmt <= countr_one(MaskVal); a larger
// shift would wrap NumBits (uint64_t) here — confirm callers cannot
// produce that combination.
3371 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3372 } else if (isShiftedMask_64(MaskVal)) {
3373 NumZeros = llvm::countr_zero(MaskVal);
3374 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3375 // The number of bits in the result bitfield will be the number of
3376 // trailing zeros plus the number of set bits in the mask minus the
3377 // number of bits we shift off
3378 NumBits = NumZeros + NumOnes - ShiftAmt;
3379 } else {
3380 // This is not a mask we can handle
3381 return false;
3384 if (ShiftAmt < NumZeros) {
3385 // Handling this case would require extra logic that would make this
3386 // transformation non-profitable
3387 return false;
3390 Val = AndLHS;
3391 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3392 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3393 } else if (LHS->getOpcode() == ISD::SHL) {
3394 // Here, we have a pattern like:
3396 // (sra (shl val, NN), MM)
3397 // or
3398 // (srl (shl val, NN), MM)
3400 // If MM >= NN, we can efficiently optimize this with bfe
3401 Val = LHS->getOperand(0);
3403 SDValue ShlRHS = LHS->getOperand(1);
3404 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3405 if (!ShlCnst) {
3406 // Shift amount must be constant
3407 return false;
3409 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3411 SDValue ShrRHS = RHS;
3412 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3413 if (!ShrCnst) {
3414 // Shift amount must be constant
3415 return false;
3417 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3419 // To avoid extra codegen and be profitable, we need Outer >= Inner
3420 if (OuterShiftAmt < InnerShiftAmt) {
3421 return false;
3424 // If the outer shift is more than the type size, we have no bitfield to
3425 // extract (since we also check that the inner shift is <= the outer shift
3426 // then this also implies that the inner shift is < the type size)
3427 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3428 return false;
3431 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3432 MVT::i32);
3433 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3434 DL, MVT::i32);
3436 if (N->getOpcode() == ISD::SRA) {
3437 // If we have a arithmetic right shift, we need to use the signed bfe
3438 // variant
3439 IsSigned = true;
3441 } else {
3442 // No can do...
3443 return false;
3445 } else {
3446 // No can do...
3447 return false;
3451 unsigned Opc;
3452 // Pick the BFE variant by width and signedness: only the (sra (shl ...))
3453 // pattern above sets IsSigned; everything formed from 'and'/'srl' uses the
// unsigned variants.
3454 if (Val.getValueType() == MVT::i32) {
3455 if (IsSigned) {
3456 Opc = NVPTX::BFE_S32rii;
3457 } else {
3458 Opc = NVPTX::BFE_U32rii;
3460 } else if (Val.getValueType() == MVT::i64) {
3461 if (IsSigned) {
3462 Opc = NVPTX::BFE_S64rii;
3463 } else {
3464 Opc = NVPTX::BFE_U64rii;
3466 } else {
3467 // We cannot handle this type
3468 return false;
3471 SDValue Ops[] = {
3472 Val, Start, Len
3475 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3476 return true;
3479 // SelectDirectAddr - Match a direct address for DAG.
3480 // A direct address could be a globaladdress or externalsymbol.
3481 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3482 // Return true if TGA or ES.
3483 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3484 N.getOpcode() == ISD::TargetExternalSymbol) {
3485 Address = N;
3486 return true;
3488 if (N.getOpcode() == NVPTXISD::Wrapper) {
3489 Address = N.getOperand(0);
3490 return true;
3492 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3493 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3494 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3495 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3496 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3497 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3499 return false;
3502 // symbol+offset
3503 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3504 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3505 if (Addr.getOpcode() == ISD::ADD) {
3506 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3507 SDValue base = Addr.getOperand(0);
3508 if (SelectDirectAddr(base, Base)) {
3509 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3510 mvt);
3511 return true;
3515 return false;
3518 // symbol+offset
// Match Addr as symbol+offset with an i32 offset; thin wrapper over
// SelectADDRsi_imp.
3519 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3520 SDValue &Base, SDValue &Offset) {
3521 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3524 // symbol+offset
// Match Addr as symbol+offset with an i64 offset; thin wrapper over
// SelectADDRsi_imp.
3525 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3526 SDValue &Base, SDValue &Offset) {
3527 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3530 // register+offset
3531 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3532 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3533 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3534 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3535 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3536 return true;
3538 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3539 Addr.getOpcode() == ISD::TargetGlobalAddress)
3540 return false; // direct calls.
3542 if (Addr.getOpcode() == ISD::ADD) {
3543 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3544 return false;
3546 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3547 if (FrameIndexSDNode *FIN =
3548 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3549 // Constant offset from frame ref.
3550 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3551 else
3552 Base = Addr.getOperand(0);
3553 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3554 mvt);
3555 return true;
3558 return false;
3561 // register+offset
// Match Addr as register+offset with an i32 offset; thin wrapper over
// SelectADDRri_imp.
3562 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3563 SDValue &Base, SDValue &Offset) {
3564 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3567 // register+offset
// Match Addr as register+offset with an i64 offset; thin wrapper over
// SelectADDRri_imp.
3568 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3569 SDValue &Base, SDValue &Offset) {
3570 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3573 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3574 unsigned int spN) const {
3575 const Value *Src = nullptr;
3576 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3577 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3578 return true;
3579 Src = mN->getMemOperand()->getValue();
3581 if (!Src)
3582 return false;
3583 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3584 return (PT->getAddressSpace() == spN);
3585 return false;
3588 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3589 /// inline asm expressions.
3590 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3591 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3592 std::vector<SDValue> &OutOps) {
3593 SDValue Op0, Op1;
3594 switch (ConstraintID) {
3595 default:
3596 return true;
3597 case InlineAsm::ConstraintCode::m: // memory
3598 if (SelectDirectAddr(Op, Op0)) {
3599 OutOps.push_back(Op0);
3600 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3601 return false;
3603 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3604 OutOps.push_back(Op0);
3605 OutOps.push_back(Op1);
3606 return false;
3608 break;
3610 return true;
3613 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3614 /// conversion from \p SrcTy to \p DestTy.
3615 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3616 LoadSDNode *LdNode) {
3617 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3618 switch (SrcTy.SimpleTy) {
3619 default:
3620 llvm_unreachable("Unhandled source type");
3621 case MVT::i8:
3622 switch (DestTy.SimpleTy) {
3623 default:
3624 llvm_unreachable("Unhandled dest type");
3625 case MVT::i16:
3626 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3627 case MVT::i32:
3628 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3629 case MVT::i64:
3630 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3632 case MVT::i16:
3633 switch (DestTy.SimpleTy) {
3634 default:
3635 llvm_unreachable("Unhandled dest type");
3636 case MVT::i8:
3637 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3638 case MVT::i32:
3639 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3640 case MVT::i64:
3641 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3643 case MVT::i32:
3644 switch (DestTy.SimpleTy) {
3645 default:
3646 llvm_unreachable("Unhandled dest type");
3647 case MVT::i8:
3648 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3649 case MVT::i16:
3650 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3651 case MVT::i64:
3652 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3654 case MVT::i64:
3655 switch (DestTy.SimpleTy) {
3656 default:
3657 llvm_unreachable("Unhandled dest type");
3658 case MVT::i8:
3659 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3660 case MVT::i16:
3661 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3662 case MVT::i32:
3663 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3665 case MVT::f16:
3666 switch (DestTy.SimpleTy) {
3667 default:
3668 llvm_unreachable("Unhandled dest type");
3669 case MVT::f32:
3670 return NVPTX::CVT_f32_f16;
3671 case MVT::f64:
3672 return NVPTX::CVT_f64_f16;