1 //===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file defines an instruction selector for the NVPTX target.
11 //===----------------------------------------------------------------------===//
13 #include "NVPTXISelDAGToDAG.h"
14 #include "MCTargetDesc/NVPTXBaseInfo.h"
15 #include "NVPTXUtilities.h"
16 #include "llvm/Analysis/ValueTracking.h"
17 #include "llvm/IR/GlobalValue.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicsNVPTX.h"
20 #include "llvm/Support/AtomicOrdering.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/Debug.h"
23 #include "llvm/Support/ErrorHandling.h"
24 #include "llvm/Support/raw_ostream.h"
25 #include "llvm/Target/TargetIntrinsicInfo.h"
29 #define DEBUG_TYPE "nvptx-isel"
31 /// createNVPTXISelDag - This pass converts a legalized DAG into a
32 /// NVPTX-specific DAG, ready for instruction scheduling.
33 FunctionPass
*llvm::createNVPTXISelDag(NVPTXTargetMachine
&TM
,
34 llvm::CodeGenOpt::Level OptLevel
) {
35 return new NVPTXDAGToDAGISel(TM
, OptLevel
);
38 NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine
&tm
,
39 CodeGenOpt::Level OptLevel
)
40 : SelectionDAGISel(tm
, OptLevel
), TM(tm
) {
41 doMulWide
= (OptLevel
> 0);
44 bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction
&MF
) {
45 Subtarget
= &static_cast<const NVPTXSubtarget
&>(MF
.getSubtarget());
46 return SelectionDAGISel::runOnMachineFunction(MF
);
49 int NVPTXDAGToDAGISel::getDivF32Level() const {
50 return Subtarget
->getTargetLowering()->getDivF32Level();
53 bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
54 return Subtarget
->getTargetLowering()->usePrecSqrtF32();
57 bool NVPTXDAGToDAGISel::useF32FTZ() const {
58 return Subtarget
->getTargetLowering()->useF32FTZ(*MF
);
61 bool NVPTXDAGToDAGISel::allowFMA() const {
62 const NVPTXTargetLowering
*TL
= Subtarget
->getTargetLowering();
63 return TL
->allowFMA(*MF
, OptLevel
);
66 bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
67 const NVPTXTargetLowering
*TL
= Subtarget
->getTargetLowering();
68 return TL
->allowUnsafeFPMath(*MF
);
71 bool NVPTXDAGToDAGISel::useShortPointers() const {
72 return TM
.useShortPointers();
75 /// Select - Select instructions not customized! Used for
76 /// expanded, promoted and normal instructions.
77 void NVPTXDAGToDAGISel::Select(SDNode
*N
) {
79 if (N
->isMachineOpcode()) {
81 return; // Already selected.
84 switch (N
->getOpcode()) {
86 case ISD::ATOMIC_LOAD
:
91 case ISD::ATOMIC_STORE
:
95 case ISD::EXTRACT_VECTOR_ELT
:
96 if (tryEXTRACT_VECTOR_ELEMENT(N
))
99 case NVPTXISD::SETP_F16X2
:
103 case NVPTXISD::LoadV2
:
104 case NVPTXISD::LoadV4
:
105 if (tryLoadVector(N
))
108 case NVPTXISD::LDGV2
:
109 case NVPTXISD::LDGV4
:
110 case NVPTXISD::LDUV2
:
111 case NVPTXISD::LDUV4
:
115 case NVPTXISD::StoreV2
:
116 case NVPTXISD::StoreV4
:
117 if (tryStoreVector(N
))
120 case NVPTXISD::LoadParam
:
121 case NVPTXISD::LoadParamV2
:
122 case NVPTXISD::LoadParamV4
:
126 case NVPTXISD::StoreRetval
:
127 case NVPTXISD::StoreRetvalV2
:
128 case NVPTXISD::StoreRetvalV4
:
129 if (tryStoreRetval(N
))
132 case NVPTXISD::StoreParam
:
133 case NVPTXISD::StoreParamV2
:
134 case NVPTXISD::StoreParamV4
:
135 case NVPTXISD::StoreParamS32
:
136 case NVPTXISD::StoreParamU32
:
137 if (tryStoreParam(N
))
140 case ISD::INTRINSIC_WO_CHAIN
:
141 if (tryIntrinsicNoChain(N
))
144 case ISD::INTRINSIC_W_CHAIN
:
145 if (tryIntrinsicChain(N
))
148 case NVPTXISD::Tex1DFloatS32
:
149 case NVPTXISD::Tex1DFloatFloat
:
150 case NVPTXISD::Tex1DFloatFloatLevel
:
151 case NVPTXISD::Tex1DFloatFloatGrad
:
152 case NVPTXISD::Tex1DS32S32
:
153 case NVPTXISD::Tex1DS32Float
:
154 case NVPTXISD::Tex1DS32FloatLevel
:
155 case NVPTXISD::Tex1DS32FloatGrad
:
156 case NVPTXISD::Tex1DU32S32
:
157 case NVPTXISD::Tex1DU32Float
:
158 case NVPTXISD::Tex1DU32FloatLevel
:
159 case NVPTXISD::Tex1DU32FloatGrad
:
160 case NVPTXISD::Tex1DArrayFloatS32
:
161 case NVPTXISD::Tex1DArrayFloatFloat
:
162 case NVPTXISD::Tex1DArrayFloatFloatLevel
:
163 case NVPTXISD::Tex1DArrayFloatFloatGrad
:
164 case NVPTXISD::Tex1DArrayS32S32
:
165 case NVPTXISD::Tex1DArrayS32Float
:
166 case NVPTXISD::Tex1DArrayS32FloatLevel
:
167 case NVPTXISD::Tex1DArrayS32FloatGrad
:
168 case NVPTXISD::Tex1DArrayU32S32
:
169 case NVPTXISD::Tex1DArrayU32Float
:
170 case NVPTXISD::Tex1DArrayU32FloatLevel
:
171 case NVPTXISD::Tex1DArrayU32FloatGrad
:
172 case NVPTXISD::Tex2DFloatS32
:
173 case NVPTXISD::Tex2DFloatFloat
:
174 case NVPTXISD::Tex2DFloatFloatLevel
:
175 case NVPTXISD::Tex2DFloatFloatGrad
:
176 case NVPTXISD::Tex2DS32S32
:
177 case NVPTXISD::Tex2DS32Float
:
178 case NVPTXISD::Tex2DS32FloatLevel
:
179 case NVPTXISD::Tex2DS32FloatGrad
:
180 case NVPTXISD::Tex2DU32S32
:
181 case NVPTXISD::Tex2DU32Float
:
182 case NVPTXISD::Tex2DU32FloatLevel
:
183 case NVPTXISD::Tex2DU32FloatGrad
:
184 case NVPTXISD::Tex2DArrayFloatS32
:
185 case NVPTXISD::Tex2DArrayFloatFloat
:
186 case NVPTXISD::Tex2DArrayFloatFloatLevel
:
187 case NVPTXISD::Tex2DArrayFloatFloatGrad
:
188 case NVPTXISD::Tex2DArrayS32S32
:
189 case NVPTXISD::Tex2DArrayS32Float
:
190 case NVPTXISD::Tex2DArrayS32FloatLevel
:
191 case NVPTXISD::Tex2DArrayS32FloatGrad
:
192 case NVPTXISD::Tex2DArrayU32S32
:
193 case NVPTXISD::Tex2DArrayU32Float
:
194 case NVPTXISD::Tex2DArrayU32FloatLevel
:
195 case NVPTXISD::Tex2DArrayU32FloatGrad
:
196 case NVPTXISD::Tex3DFloatS32
:
197 case NVPTXISD::Tex3DFloatFloat
:
198 case NVPTXISD::Tex3DFloatFloatLevel
:
199 case NVPTXISD::Tex3DFloatFloatGrad
:
200 case NVPTXISD::Tex3DS32S32
:
201 case NVPTXISD::Tex3DS32Float
:
202 case NVPTXISD::Tex3DS32FloatLevel
:
203 case NVPTXISD::Tex3DS32FloatGrad
:
204 case NVPTXISD::Tex3DU32S32
:
205 case NVPTXISD::Tex3DU32Float
:
206 case NVPTXISD::Tex3DU32FloatLevel
:
207 case NVPTXISD::Tex3DU32FloatGrad
:
208 case NVPTXISD::TexCubeFloatFloat
:
209 case NVPTXISD::TexCubeFloatFloatLevel
:
210 case NVPTXISD::TexCubeS32Float
:
211 case NVPTXISD::TexCubeS32FloatLevel
:
212 case NVPTXISD::TexCubeU32Float
:
213 case NVPTXISD::TexCubeU32FloatLevel
:
214 case NVPTXISD::TexCubeArrayFloatFloat
:
215 case NVPTXISD::TexCubeArrayFloatFloatLevel
:
216 case NVPTXISD::TexCubeArrayS32Float
:
217 case NVPTXISD::TexCubeArrayS32FloatLevel
:
218 case NVPTXISD::TexCubeArrayU32Float
:
219 case NVPTXISD::TexCubeArrayU32FloatLevel
:
220 case NVPTXISD::Tld4R2DFloatFloat
:
221 case NVPTXISD::Tld4G2DFloatFloat
:
222 case NVPTXISD::Tld4B2DFloatFloat
:
223 case NVPTXISD::Tld4A2DFloatFloat
:
224 case NVPTXISD::Tld4R2DS64Float
:
225 case NVPTXISD::Tld4G2DS64Float
:
226 case NVPTXISD::Tld4B2DS64Float
:
227 case NVPTXISD::Tld4A2DS64Float
:
228 case NVPTXISD::Tld4R2DU64Float
:
229 case NVPTXISD::Tld4G2DU64Float
:
230 case NVPTXISD::Tld4B2DU64Float
:
231 case NVPTXISD::Tld4A2DU64Float
:
232 case NVPTXISD::TexUnified1DFloatS32
:
233 case NVPTXISD::TexUnified1DFloatFloat
:
234 case NVPTXISD::TexUnified1DFloatFloatLevel
:
235 case NVPTXISD::TexUnified1DFloatFloatGrad
:
236 case NVPTXISD::TexUnified1DS32S32
:
237 case NVPTXISD::TexUnified1DS32Float
:
238 case NVPTXISD::TexUnified1DS32FloatLevel
:
239 case NVPTXISD::TexUnified1DS32FloatGrad
:
240 case NVPTXISD::TexUnified1DU32S32
:
241 case NVPTXISD::TexUnified1DU32Float
:
242 case NVPTXISD::TexUnified1DU32FloatLevel
:
243 case NVPTXISD::TexUnified1DU32FloatGrad
:
244 case NVPTXISD::TexUnified1DArrayFloatS32
:
245 case NVPTXISD::TexUnified1DArrayFloatFloat
:
246 case NVPTXISD::TexUnified1DArrayFloatFloatLevel
:
247 case NVPTXISD::TexUnified1DArrayFloatFloatGrad
:
248 case NVPTXISD::TexUnified1DArrayS32S32
:
249 case NVPTXISD::TexUnified1DArrayS32Float
:
250 case NVPTXISD::TexUnified1DArrayS32FloatLevel
:
251 case NVPTXISD::TexUnified1DArrayS32FloatGrad
:
252 case NVPTXISD::TexUnified1DArrayU32S32
:
253 case NVPTXISD::TexUnified1DArrayU32Float
:
254 case NVPTXISD::TexUnified1DArrayU32FloatLevel
:
255 case NVPTXISD::TexUnified1DArrayU32FloatGrad
:
256 case NVPTXISD::TexUnified2DFloatS32
:
257 case NVPTXISD::TexUnified2DFloatFloat
:
258 case NVPTXISD::TexUnified2DFloatFloatLevel
:
259 case NVPTXISD::TexUnified2DFloatFloatGrad
:
260 case NVPTXISD::TexUnified2DS32S32
:
261 case NVPTXISD::TexUnified2DS32Float
:
262 case NVPTXISD::TexUnified2DS32FloatLevel
:
263 case NVPTXISD::TexUnified2DS32FloatGrad
:
264 case NVPTXISD::TexUnified2DU32S32
:
265 case NVPTXISD::TexUnified2DU32Float
:
266 case NVPTXISD::TexUnified2DU32FloatLevel
:
267 case NVPTXISD::TexUnified2DU32FloatGrad
:
268 case NVPTXISD::TexUnified2DArrayFloatS32
:
269 case NVPTXISD::TexUnified2DArrayFloatFloat
:
270 case NVPTXISD::TexUnified2DArrayFloatFloatLevel
:
271 case NVPTXISD::TexUnified2DArrayFloatFloatGrad
:
272 case NVPTXISD::TexUnified2DArrayS32S32
:
273 case NVPTXISD::TexUnified2DArrayS32Float
:
274 case NVPTXISD::TexUnified2DArrayS32FloatLevel
:
275 case NVPTXISD::TexUnified2DArrayS32FloatGrad
:
276 case NVPTXISD::TexUnified2DArrayU32S32
:
277 case NVPTXISD::TexUnified2DArrayU32Float
:
278 case NVPTXISD::TexUnified2DArrayU32FloatLevel
:
279 case NVPTXISD::TexUnified2DArrayU32FloatGrad
:
280 case NVPTXISD::TexUnified3DFloatS32
:
281 case NVPTXISD::TexUnified3DFloatFloat
:
282 case NVPTXISD::TexUnified3DFloatFloatLevel
:
283 case NVPTXISD::TexUnified3DFloatFloatGrad
:
284 case NVPTXISD::TexUnified3DS32S32
:
285 case NVPTXISD::TexUnified3DS32Float
:
286 case NVPTXISD::TexUnified3DS32FloatLevel
:
287 case NVPTXISD::TexUnified3DS32FloatGrad
:
288 case NVPTXISD::TexUnified3DU32S32
:
289 case NVPTXISD::TexUnified3DU32Float
:
290 case NVPTXISD::TexUnified3DU32FloatLevel
:
291 case NVPTXISD::TexUnified3DU32FloatGrad
:
292 case NVPTXISD::TexUnifiedCubeFloatFloat
:
293 case NVPTXISD::TexUnifiedCubeFloatFloatLevel
:
294 case NVPTXISD::TexUnifiedCubeS32Float
:
295 case NVPTXISD::TexUnifiedCubeS32FloatLevel
:
296 case NVPTXISD::TexUnifiedCubeU32Float
:
297 case NVPTXISD::TexUnifiedCubeU32FloatLevel
:
298 case NVPTXISD::TexUnifiedCubeArrayFloatFloat
:
299 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel
:
300 case NVPTXISD::TexUnifiedCubeArrayS32Float
:
301 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel
:
302 case NVPTXISD::TexUnifiedCubeArrayU32Float
:
303 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel
:
304 case NVPTXISD::Tld4UnifiedR2DFloatFloat
:
305 case NVPTXISD::Tld4UnifiedG2DFloatFloat
:
306 case NVPTXISD::Tld4UnifiedB2DFloatFloat
:
307 case NVPTXISD::Tld4UnifiedA2DFloatFloat
:
308 case NVPTXISD::Tld4UnifiedR2DS64Float
:
309 case NVPTXISD::Tld4UnifiedG2DS64Float
:
310 case NVPTXISD::Tld4UnifiedB2DS64Float
:
311 case NVPTXISD::Tld4UnifiedA2DS64Float
:
312 case NVPTXISD::Tld4UnifiedR2DU64Float
:
313 case NVPTXISD::Tld4UnifiedG2DU64Float
:
314 case NVPTXISD::Tld4UnifiedB2DU64Float
:
315 case NVPTXISD::Tld4UnifiedA2DU64Float
:
316 if (tryTextureIntrinsic(N
))
319 case NVPTXISD::Suld1DI8Clamp
:
320 case NVPTXISD::Suld1DI16Clamp
:
321 case NVPTXISD::Suld1DI32Clamp
:
322 case NVPTXISD::Suld1DI64Clamp
:
323 case NVPTXISD::Suld1DV2I8Clamp
:
324 case NVPTXISD::Suld1DV2I16Clamp
:
325 case NVPTXISD::Suld1DV2I32Clamp
:
326 case NVPTXISD::Suld1DV2I64Clamp
:
327 case NVPTXISD::Suld1DV4I8Clamp
:
328 case NVPTXISD::Suld1DV4I16Clamp
:
329 case NVPTXISD::Suld1DV4I32Clamp
:
330 case NVPTXISD::Suld1DArrayI8Clamp
:
331 case NVPTXISD::Suld1DArrayI16Clamp
:
332 case NVPTXISD::Suld1DArrayI32Clamp
:
333 case NVPTXISD::Suld1DArrayI64Clamp
:
334 case NVPTXISD::Suld1DArrayV2I8Clamp
:
335 case NVPTXISD::Suld1DArrayV2I16Clamp
:
336 case NVPTXISD::Suld1DArrayV2I32Clamp
:
337 case NVPTXISD::Suld1DArrayV2I64Clamp
:
338 case NVPTXISD::Suld1DArrayV4I8Clamp
:
339 case NVPTXISD::Suld1DArrayV4I16Clamp
:
340 case NVPTXISD::Suld1DArrayV4I32Clamp
:
341 case NVPTXISD::Suld2DI8Clamp
:
342 case NVPTXISD::Suld2DI16Clamp
:
343 case NVPTXISD::Suld2DI32Clamp
:
344 case NVPTXISD::Suld2DI64Clamp
:
345 case NVPTXISD::Suld2DV2I8Clamp
:
346 case NVPTXISD::Suld2DV2I16Clamp
:
347 case NVPTXISD::Suld2DV2I32Clamp
:
348 case NVPTXISD::Suld2DV2I64Clamp
:
349 case NVPTXISD::Suld2DV4I8Clamp
:
350 case NVPTXISD::Suld2DV4I16Clamp
:
351 case NVPTXISD::Suld2DV4I32Clamp
:
352 case NVPTXISD::Suld2DArrayI8Clamp
:
353 case NVPTXISD::Suld2DArrayI16Clamp
:
354 case NVPTXISD::Suld2DArrayI32Clamp
:
355 case NVPTXISD::Suld2DArrayI64Clamp
:
356 case NVPTXISD::Suld2DArrayV2I8Clamp
:
357 case NVPTXISD::Suld2DArrayV2I16Clamp
:
358 case NVPTXISD::Suld2DArrayV2I32Clamp
:
359 case NVPTXISD::Suld2DArrayV2I64Clamp
:
360 case NVPTXISD::Suld2DArrayV4I8Clamp
:
361 case NVPTXISD::Suld2DArrayV4I16Clamp
:
362 case NVPTXISD::Suld2DArrayV4I32Clamp
:
363 case NVPTXISD::Suld3DI8Clamp
:
364 case NVPTXISD::Suld3DI16Clamp
:
365 case NVPTXISD::Suld3DI32Clamp
:
366 case NVPTXISD::Suld3DI64Clamp
:
367 case NVPTXISD::Suld3DV2I8Clamp
:
368 case NVPTXISD::Suld3DV2I16Clamp
:
369 case NVPTXISD::Suld3DV2I32Clamp
:
370 case NVPTXISD::Suld3DV2I64Clamp
:
371 case NVPTXISD::Suld3DV4I8Clamp
:
372 case NVPTXISD::Suld3DV4I16Clamp
:
373 case NVPTXISD::Suld3DV4I32Clamp
:
374 case NVPTXISD::Suld1DI8Trap
:
375 case NVPTXISD::Suld1DI16Trap
:
376 case NVPTXISD::Suld1DI32Trap
:
377 case NVPTXISD::Suld1DI64Trap
:
378 case NVPTXISD::Suld1DV2I8Trap
:
379 case NVPTXISD::Suld1DV2I16Trap
:
380 case NVPTXISD::Suld1DV2I32Trap
:
381 case NVPTXISD::Suld1DV2I64Trap
:
382 case NVPTXISD::Suld1DV4I8Trap
:
383 case NVPTXISD::Suld1DV4I16Trap
:
384 case NVPTXISD::Suld1DV4I32Trap
:
385 case NVPTXISD::Suld1DArrayI8Trap
:
386 case NVPTXISD::Suld1DArrayI16Trap
:
387 case NVPTXISD::Suld1DArrayI32Trap
:
388 case NVPTXISD::Suld1DArrayI64Trap
:
389 case NVPTXISD::Suld1DArrayV2I8Trap
:
390 case NVPTXISD::Suld1DArrayV2I16Trap
:
391 case NVPTXISD::Suld1DArrayV2I32Trap
:
392 case NVPTXISD::Suld1DArrayV2I64Trap
:
393 case NVPTXISD::Suld1DArrayV4I8Trap
:
394 case NVPTXISD::Suld1DArrayV4I16Trap
:
395 case NVPTXISD::Suld1DArrayV4I32Trap
:
396 case NVPTXISD::Suld2DI8Trap
:
397 case NVPTXISD::Suld2DI16Trap
:
398 case NVPTXISD::Suld2DI32Trap
:
399 case NVPTXISD::Suld2DI64Trap
:
400 case NVPTXISD::Suld2DV2I8Trap
:
401 case NVPTXISD::Suld2DV2I16Trap
:
402 case NVPTXISD::Suld2DV2I32Trap
:
403 case NVPTXISD::Suld2DV2I64Trap
:
404 case NVPTXISD::Suld2DV4I8Trap
:
405 case NVPTXISD::Suld2DV4I16Trap
:
406 case NVPTXISD::Suld2DV4I32Trap
:
407 case NVPTXISD::Suld2DArrayI8Trap
:
408 case NVPTXISD::Suld2DArrayI16Trap
:
409 case NVPTXISD::Suld2DArrayI32Trap
:
410 case NVPTXISD::Suld2DArrayI64Trap
:
411 case NVPTXISD::Suld2DArrayV2I8Trap
:
412 case NVPTXISD::Suld2DArrayV2I16Trap
:
413 case NVPTXISD::Suld2DArrayV2I32Trap
:
414 case NVPTXISD::Suld2DArrayV2I64Trap
:
415 case NVPTXISD::Suld2DArrayV4I8Trap
:
416 case NVPTXISD::Suld2DArrayV4I16Trap
:
417 case NVPTXISD::Suld2DArrayV4I32Trap
:
418 case NVPTXISD::Suld3DI8Trap
:
419 case NVPTXISD::Suld3DI16Trap
:
420 case NVPTXISD::Suld3DI32Trap
:
421 case NVPTXISD::Suld3DI64Trap
:
422 case NVPTXISD::Suld3DV2I8Trap
:
423 case NVPTXISD::Suld3DV2I16Trap
:
424 case NVPTXISD::Suld3DV2I32Trap
:
425 case NVPTXISD::Suld3DV2I64Trap
:
426 case NVPTXISD::Suld3DV4I8Trap
:
427 case NVPTXISD::Suld3DV4I16Trap
:
428 case NVPTXISD::Suld3DV4I32Trap
:
429 case NVPTXISD::Suld1DI8Zero
:
430 case NVPTXISD::Suld1DI16Zero
:
431 case NVPTXISD::Suld1DI32Zero
:
432 case NVPTXISD::Suld1DI64Zero
:
433 case NVPTXISD::Suld1DV2I8Zero
:
434 case NVPTXISD::Suld1DV2I16Zero
:
435 case NVPTXISD::Suld1DV2I32Zero
:
436 case NVPTXISD::Suld1DV2I64Zero
:
437 case NVPTXISD::Suld1DV4I8Zero
:
438 case NVPTXISD::Suld1DV4I16Zero
:
439 case NVPTXISD::Suld1DV4I32Zero
:
440 case NVPTXISD::Suld1DArrayI8Zero
:
441 case NVPTXISD::Suld1DArrayI16Zero
:
442 case NVPTXISD::Suld1DArrayI32Zero
:
443 case NVPTXISD::Suld1DArrayI64Zero
:
444 case NVPTXISD::Suld1DArrayV2I8Zero
:
445 case NVPTXISD::Suld1DArrayV2I16Zero
:
446 case NVPTXISD::Suld1DArrayV2I32Zero
:
447 case NVPTXISD::Suld1DArrayV2I64Zero
:
448 case NVPTXISD::Suld1DArrayV4I8Zero
:
449 case NVPTXISD::Suld1DArrayV4I16Zero
:
450 case NVPTXISD::Suld1DArrayV4I32Zero
:
451 case NVPTXISD::Suld2DI8Zero
:
452 case NVPTXISD::Suld2DI16Zero
:
453 case NVPTXISD::Suld2DI32Zero
:
454 case NVPTXISD::Suld2DI64Zero
:
455 case NVPTXISD::Suld2DV2I8Zero
:
456 case NVPTXISD::Suld2DV2I16Zero
:
457 case NVPTXISD::Suld2DV2I32Zero
:
458 case NVPTXISD::Suld2DV2I64Zero
:
459 case NVPTXISD::Suld2DV4I8Zero
:
460 case NVPTXISD::Suld2DV4I16Zero
:
461 case NVPTXISD::Suld2DV4I32Zero
:
462 case NVPTXISD::Suld2DArrayI8Zero
:
463 case NVPTXISD::Suld2DArrayI16Zero
:
464 case NVPTXISD::Suld2DArrayI32Zero
:
465 case NVPTXISD::Suld2DArrayI64Zero
:
466 case NVPTXISD::Suld2DArrayV2I8Zero
:
467 case NVPTXISD::Suld2DArrayV2I16Zero
:
468 case NVPTXISD::Suld2DArrayV2I32Zero
:
469 case NVPTXISD::Suld2DArrayV2I64Zero
:
470 case NVPTXISD::Suld2DArrayV4I8Zero
:
471 case NVPTXISD::Suld2DArrayV4I16Zero
:
472 case NVPTXISD::Suld2DArrayV4I32Zero
:
473 case NVPTXISD::Suld3DI8Zero
:
474 case NVPTXISD::Suld3DI16Zero
:
475 case NVPTXISD::Suld3DI32Zero
:
476 case NVPTXISD::Suld3DI64Zero
:
477 case NVPTXISD::Suld3DV2I8Zero
:
478 case NVPTXISD::Suld3DV2I16Zero
:
479 case NVPTXISD::Suld3DV2I32Zero
:
480 case NVPTXISD::Suld3DV2I64Zero
:
481 case NVPTXISD::Suld3DV4I8Zero
:
482 case NVPTXISD::Suld3DV4I16Zero
:
483 case NVPTXISD::Suld3DV4I32Zero
:
484 if (trySurfaceIntrinsic(N
))
494 case ISD::ADDRSPACECAST
:
495 SelectAddrSpaceCast(N
);
497 case ISD::ConstantFP
:
498 if (tryConstantFP16(N
))
507 bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode
*N
) {
508 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue();
512 case Intrinsic::nvvm_ldg_global_f
:
513 case Intrinsic::nvvm_ldg_global_i
:
514 case Intrinsic::nvvm_ldg_global_p
:
515 case Intrinsic::nvvm_ldu_global_f
:
516 case Intrinsic::nvvm_ldu_global_i
:
517 case Intrinsic::nvvm_ldu_global_p
:
522 // There's no way to specify FP16 immediates in .f16 ops, so we have to
523 // load them into an .f16 register first.
524 bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode
*N
) {
525 if (N
->getValueType(0) != MVT::f16
)
527 SDValue Val
= CurDAG
->getTargetConstantFP(
528 cast
<ConstantFPSDNode
>(N
)->getValueAPF(), SDLoc(N
), MVT::f16
);
529 SDNode
*LoadConstF16
=
530 CurDAG
->getMachineNode(NVPTX::LOAD_CONST_F16
, SDLoc(N
), MVT::f16
, Val
);
531 ReplaceNode(N
, LoadConstF16
);
// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
537 static unsigned getPTXCmpMode(const CondCodeSDNode
&CondCode
, bool FTZ
) {
538 using NVPTX::PTXCmpMode::CmpMode
;
539 unsigned PTXCmpMode
= [](ISD::CondCode CC
) {
542 llvm_unreachable("Unexpected condition code.");
558 return CmpMode::NotANumber
;
587 PTXCmpMode
|= NVPTX::PTXCmpMode::FTZ_FLAG
;
592 bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode
*N
) {
593 unsigned PTXCmpMode
=
594 getPTXCmpMode(*cast
<CondCodeSDNode
>(N
->getOperand(2)), useF32FTZ());
596 SDNode
*SetP
= CurDAG
->getMachineNode(
597 NVPTX::SETP_f16x2rr
, DL
, MVT::i1
, MVT::i1
, N
->getOperand(0),
598 N
->getOperand(1), CurDAG
->getTargetConstant(PTXCmpMode
, DL
, MVT::i32
));
599 ReplaceNode(N
, SetP
);
603 // Find all instances of extract_vector_elt that use this v2f16 vector
604 // and coalesce them into a scattering move instruction.
605 bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode
*N
) {
606 SDValue Vector
= N
->getOperand(0);
608 // We only care about f16x2 as it's the only real vector type we
609 // need to deal with.
610 if (Vector
.getSimpleValueType() != MVT::v2f16
)
613 // Find and record all uses of this vector that extract element 0 or 1.
614 SmallVector
<SDNode
*, 4> E0
, E1
;
615 for (auto U
: Vector
.getNode()->uses()) {
616 if (U
->getOpcode() != ISD::EXTRACT_VECTOR_ELT
)
618 if (U
->getOperand(0) != Vector
)
620 if (const ConstantSDNode
*IdxConst
=
621 dyn_cast
<ConstantSDNode
>(U
->getOperand(1))) {
622 if (IdxConst
->getZExtValue() == 0)
624 else if (IdxConst
->getZExtValue() == 1)
627 llvm_unreachable("Invalid vector index.");
  // There's no point scattering f16x2 if we only ever access one
  // element of it.
633 if (E0
.empty() || E1
.empty())
636 unsigned Op
= NVPTX::SplitF16x2
;
637 // If the vector has been BITCAST'ed from i32, we can use original
638 // value directly and avoid register-to-register move.
639 SDValue Source
= Vector
;
640 if (Vector
->getOpcode() == ISD::BITCAST
) {
641 Op
= NVPTX::SplitI32toF16x2
;
642 Source
= Vector
->getOperand(0);
644 // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
645 // into f16,f16 SplitF16x2(V)
647 CurDAG
->getMachineNode(Op
, SDLoc(N
), MVT::f16
, MVT::f16
, Source
);
648 for (auto *Node
: E0
)
649 ReplaceUses(SDValue(Node
, 0), SDValue(ScatterOp
, 0));
650 for (auto *Node
: E1
)
651 ReplaceUses(SDValue(Node
, 0), SDValue(ScatterOp
, 1));
656 static unsigned int getCodeAddrSpace(MemSDNode
*N
) {
657 const Value
*Src
= N
->getMemOperand()->getValue();
660 return NVPTX::PTXLdStInstCode::GENERIC
;
662 if (auto *PT
= dyn_cast
<PointerType
>(Src
->getType())) {
663 switch (PT
->getAddressSpace()) {
664 case llvm::ADDRESS_SPACE_LOCAL
: return NVPTX::PTXLdStInstCode::LOCAL
;
665 case llvm::ADDRESS_SPACE_GLOBAL
: return NVPTX::PTXLdStInstCode::GLOBAL
;
666 case llvm::ADDRESS_SPACE_SHARED
: return NVPTX::PTXLdStInstCode::SHARED
;
667 case llvm::ADDRESS_SPACE_GENERIC
: return NVPTX::PTXLdStInstCode::GENERIC
;
668 case llvm::ADDRESS_SPACE_PARAM
: return NVPTX::PTXLdStInstCode::PARAM
;
669 case llvm::ADDRESS_SPACE_CONST
: return NVPTX::PTXLdStInstCode::CONSTANT
;
673 return NVPTX::PTXLdStInstCode::GENERIC
;
676 static bool canLowerToLDG(MemSDNode
*N
, const NVPTXSubtarget
&Subtarget
,
677 unsigned CodeAddrSpace
, MachineFunction
*F
) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
681 // We have two ways of identifying invariant loads: Loads may be explicitly
682 // marked as invariant, or we may infer them to be invariant.
684 // We currently infer invariance for loads from
685 // - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    are only read, never written.
689 // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
690 // not during the SelectionDAG phase).
692 // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
693 // explicitly invariant loads because these are how clang tells us to use ldg
694 // when the user uses a builtin.
695 if (!Subtarget
.hasLDG() || CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GLOBAL
)
698 if (N
->isInvariant())
701 bool IsKernelFn
= isKernelFunction(F
->getFunction());
703 // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
704 // because the former looks through phi nodes while the latter does not. We
705 // need to look through phi nodes to handle pointer induction variables.
706 SmallVector
<const Value
*, 8> Objs
;
707 getUnderlyingObjects(N
->getMemOperand()->getValue(), Objs
);
709 return all_of(Objs
, [&](const Value
*V
) {
710 if (auto *A
= dyn_cast
<const Argument
>(V
))
711 return IsKernelFn
&& A
->onlyReadsMemory() && A
->hasNoAliasAttr();
712 if (auto *GV
= dyn_cast
<const GlobalVariable
>(V
))
713 return GV
->isConstant();
718 bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode
*N
) {
719 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(0))->getZExtValue();
723 case Intrinsic::nvvm_texsurf_handle_internal
:
724 SelectTexSurfHandle(N
);
729 void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode
*N
) {
730 // Op 0 is the intrinsic ID
731 SDValue Wrapper
= N
->getOperand(1);
732 SDValue GlobalVal
= Wrapper
.getOperand(0);
733 ReplaceNode(N
, CurDAG
->getMachineNode(NVPTX::texsurf_handles
, SDLoc(N
),
734 MVT::i64
, GlobalVal
));
737 void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode
*N
) {
738 SDValue Src
= N
->getOperand(0);
739 AddrSpaceCastSDNode
*CastN
= cast
<AddrSpaceCastSDNode
>(N
);
740 unsigned SrcAddrSpace
= CastN
->getSrcAddressSpace();
741 unsigned DstAddrSpace
= CastN
->getDestAddressSpace();
742 assert(SrcAddrSpace
!= DstAddrSpace
&&
743 "addrspacecast must be between different address spaces");
745 if (DstAddrSpace
== ADDRESS_SPACE_GENERIC
) {
746 // Specific to generic
748 switch (SrcAddrSpace
) {
749 default: report_fatal_error("Bad address space in addrspacecast");
750 case ADDRESS_SPACE_GLOBAL
:
751 Opc
= TM
.is64Bit() ? NVPTX::cvta_global_yes_64
: NVPTX::cvta_global_yes
;
753 case ADDRESS_SPACE_SHARED
:
754 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_shared_yes_6432
755 : NVPTX::cvta_shared_yes_64
)
756 : NVPTX::cvta_shared_yes
;
758 case ADDRESS_SPACE_CONST
:
759 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_const_yes_6432
760 : NVPTX::cvta_const_yes_64
)
761 : NVPTX::cvta_const_yes
;
763 case ADDRESS_SPACE_LOCAL
:
764 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_local_yes_6432
765 : NVPTX::cvta_local_yes_64
)
766 : NVPTX::cvta_local_yes
;
769 ReplaceNode(N
, CurDAG
->getMachineNode(Opc
, SDLoc(N
), N
->getValueType(0),
773 // Generic to specific
774 if (SrcAddrSpace
!= 0)
775 report_fatal_error("Cannot cast between two non-generic address spaces");
777 switch (DstAddrSpace
) {
778 default: report_fatal_error("Bad address space in addrspacecast");
779 case ADDRESS_SPACE_GLOBAL
:
780 Opc
= TM
.is64Bit() ? NVPTX::cvta_to_global_yes_64
781 : NVPTX::cvta_to_global_yes
;
783 case ADDRESS_SPACE_SHARED
:
784 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_shared_yes_3264
785 : NVPTX::cvta_to_shared_yes_64
)
786 : NVPTX::cvta_to_shared_yes
;
788 case ADDRESS_SPACE_CONST
:
789 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_const_yes_3264
790 : NVPTX::cvta_to_const_yes_64
)
791 : NVPTX::cvta_to_const_yes
;
793 case ADDRESS_SPACE_LOCAL
:
794 Opc
= TM
.is64Bit() ? (useShortPointers() ? NVPTX::cvta_to_local_yes_3264
795 : NVPTX::cvta_to_local_yes_64
)
796 : NVPTX::cvta_to_local_yes
;
798 case ADDRESS_SPACE_PARAM
:
799 Opc
= TM
.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
800 : NVPTX::nvvm_ptr_gen_to_param
;
803 ReplaceNode(N
, CurDAG
->getMachineNode(Opc
, SDLoc(N
), N
->getValueType(0),
// Helper function to reduce the amount of boilerplate code for picking an
// opcode based on the value type.
811 static Optional
<unsigned> pickOpcodeForVT(
812 MVT::SimpleValueType VT
, unsigned Opcode_i8
, unsigned Opcode_i16
,
813 unsigned Opcode_i32
, Optional
<unsigned> Opcode_i64
, unsigned Opcode_f16
,
814 unsigned Opcode_f16x2
, unsigned Opcode_f32
, Optional
<unsigned> Opcode_f64
) {
838 bool NVPTXDAGToDAGISel::tryLoad(SDNode
*N
) {
840 MemSDNode
*LD
= cast
<MemSDNode
>(N
);
841 assert(LD
->readMem() && "Expected load");
842 LoadSDNode
*PlainLoad
= dyn_cast
<LoadSDNode
>(N
);
843 EVT LoadedVT
= LD
->getMemoryVT();
844 SDNode
*NVPTXLD
= nullptr;
846 // do not support pre/post inc/dec
847 if (PlainLoad
&& PlainLoad
->isIndexed())
850 if (!LoadedVT
.isSimple())
853 AtomicOrdering Ordering
= LD
->getSuccessOrdering();
854 // In order to lower atomic loads with stronger guarantees we would need to
855 // use load.acquire or insert fences. However these features were only added
856 // with PTX ISA 6.0 / sm_70.
857 // TODO: Check if we can actually use the new instructions and implement them.
858 if (isStrongerThanMonotonic(Ordering
))
861 // Address Space Setting
862 unsigned int CodeAddrSpace
= getCodeAddrSpace(LD
);
863 if (canLowerToLDG(LD
, *Subtarget
, CodeAddrSpace
, MF
)) {
867 unsigned int PointerSize
=
868 CurDAG
->getDataLayout().getPointerSizeInBits(LD
->getAddressSpace());
871 // - .volatile is only available for .global and .shared
872 // - .volatile has the same memory synchronization semantics as .relaxed.sys
873 bool isVolatile
= LD
->isVolatile() || Ordering
== AtomicOrdering::Monotonic
;
874 if (CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GLOBAL
&&
875 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::SHARED
&&
876 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GENERIC
)
879 // Type Setting: fromType + fromTypeWidth
881 // Sign : ISD::SEXTLOAD
  // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //          type is integer
884 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
885 MVT SimpleVT
= LoadedVT
.getSimpleVT();
886 MVT ScalarVT
= SimpleVT
.getScalarType();
887 // Read at least 8 bits (predicates are stored as 8-bit values)
888 unsigned fromTypeWidth
= std::max(8U, (unsigned)ScalarVT
.getSizeInBits());
889 unsigned int fromType
;
892 unsigned vecType
= NVPTX::PTXLdStInstCode::Scalar
;
893 if (SimpleVT
.isVector()) {
894 assert(LoadedVT
== MVT::v2f16
&& "Unexpected vector type");
895 // v2f16 is loaded using ld.b32
899 if (PlainLoad
&& (PlainLoad
->getExtensionType() == ISD::SEXTLOAD
))
900 fromType
= NVPTX::PTXLdStInstCode::Signed
;
901 else if (ScalarVT
.isFloatingPoint())
902 // f16 uses .b16 as its storage type.
903 fromType
= ScalarVT
.SimpleTy
== MVT::f16
? NVPTX::PTXLdStInstCode::Untyped
904 : NVPTX::PTXLdStInstCode::Float
;
906 fromType
= NVPTX::PTXLdStInstCode::Unsigned
;
908 // Create the machine instruction DAG
909 SDValue Chain
= N
->getOperand(0);
910 SDValue N1
= N
->getOperand(1);
912 SDValue Offset
, Base
;
913 Optional
<unsigned> Opcode
;
914 MVT::SimpleValueType TargetVT
= LD
->getSimpleValueType(0).SimpleTy
;
916 if (SelectDirectAddr(N1
, Addr
)) {
917 Opcode
= pickOpcodeForVT(
918 TargetVT
, NVPTX::LD_i8_avar
, NVPTX::LD_i16_avar
, NVPTX::LD_i32_avar
,
919 NVPTX::LD_i64_avar
, NVPTX::LD_f16_avar
, NVPTX::LD_f16x2_avar
,
920 NVPTX::LD_f32_avar
, NVPTX::LD_f64_avar
);
923 SDValue Ops
[] = { getI32Imm(isVolatile
, dl
), getI32Imm(CodeAddrSpace
, dl
),
924 getI32Imm(vecType
, dl
), getI32Imm(fromType
, dl
),
925 getI32Imm(fromTypeWidth
, dl
), Addr
, Chain
};
926 NVPTXLD
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, TargetVT
,
928 } else if (PointerSize
== 64 ? SelectADDRsi64(N1
.getNode(), N1
, Base
, Offset
)
929 : SelectADDRsi(N1
.getNode(), N1
, Base
, Offset
)) {
930 Opcode
= pickOpcodeForVT(TargetVT
, NVPTX::LD_i8_asi
, NVPTX::LD_i16_asi
,
931 NVPTX::LD_i32_asi
, NVPTX::LD_i64_asi
,
932 NVPTX::LD_f16_asi
, NVPTX::LD_f16x2_asi
,
933 NVPTX::LD_f32_asi
, NVPTX::LD_f64_asi
);
936 SDValue Ops
[] = { getI32Imm(isVolatile
, dl
), getI32Imm(CodeAddrSpace
, dl
),
937 getI32Imm(vecType
, dl
), getI32Imm(fromType
, dl
),
938 getI32Imm(fromTypeWidth
, dl
), Base
, Offset
, Chain
};
939 NVPTXLD
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, TargetVT
,
941 } else if (PointerSize
== 64 ? SelectADDRri64(N1
.getNode(), N1
, Base
, Offset
)
942 : SelectADDRri(N1
.getNode(), N1
, Base
, Offset
)) {
943 if (PointerSize
== 64)
944 Opcode
= pickOpcodeForVT(
945 TargetVT
, NVPTX::LD_i8_ari_64
, NVPTX::LD_i16_ari_64
,
946 NVPTX::LD_i32_ari_64
, NVPTX::LD_i64_ari_64
, NVPTX::LD_f16_ari_64
,
947 NVPTX::LD_f16x2_ari_64
, NVPTX::LD_f32_ari_64
, NVPTX::LD_f64_ari_64
);
949 Opcode
= pickOpcodeForVT(
950 TargetVT
, NVPTX::LD_i8_ari
, NVPTX::LD_i16_ari
, NVPTX::LD_i32_ari
,
951 NVPTX::LD_i64_ari
, NVPTX::LD_f16_ari
, NVPTX::LD_f16x2_ari
,
952 NVPTX::LD_f32_ari
, NVPTX::LD_f64_ari
);
955 SDValue Ops
[] = { getI32Imm(isVolatile
, dl
), getI32Imm(CodeAddrSpace
, dl
),
956 getI32Imm(vecType
, dl
), getI32Imm(fromType
, dl
),
957 getI32Imm(fromTypeWidth
, dl
), Base
, Offset
, Chain
};
958 NVPTXLD
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, TargetVT
,
961 if (PointerSize
== 64)
962 Opcode
= pickOpcodeForVT(
963 TargetVT
, NVPTX::LD_i8_areg_64
, NVPTX::LD_i16_areg_64
,
964 NVPTX::LD_i32_areg_64
, NVPTX::LD_i64_areg_64
, NVPTX::LD_f16_areg_64
,
965 NVPTX::LD_f16x2_areg_64
, NVPTX::LD_f32_areg_64
,
966 NVPTX::LD_f64_areg_64
);
968 Opcode
= pickOpcodeForVT(
969 TargetVT
, NVPTX::LD_i8_areg
, NVPTX::LD_i16_areg
, NVPTX::LD_i32_areg
,
970 NVPTX::LD_i64_areg
, NVPTX::LD_f16_areg
, NVPTX::LD_f16x2_areg
,
971 NVPTX::LD_f32_areg
, NVPTX::LD_f64_areg
);
974 SDValue Ops
[] = { getI32Imm(isVolatile
, dl
), getI32Imm(CodeAddrSpace
, dl
),
975 getI32Imm(vecType
, dl
), getI32Imm(fromType
, dl
),
976 getI32Imm(fromTypeWidth
, dl
), N1
, Chain
};
977 NVPTXLD
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, TargetVT
,
984 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
985 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(NVPTXLD
), {MemRef
});
987 ReplaceNode(N
, NVPTXLD
);
// Selects a machine instruction for a vector load node (the code below
// handles NVPTXISD::LoadV2 and NVPTXISD::LoadV4).
//
// \param N  The load node. Operand 0 is read as the chain, operand 1 as the
//           address, and the last operand as the original extension type.
//
// Loads accepted by canLowerToLDG() are delegated to tryLDGLDU(). Otherwise
// an LDV_* opcode is picked from the element type, the vector arity and the
// addressing mode, and a machine node is built that carries N's memory
// operand.
// NOTE(review): presumably returns true when N was selected and false when it
// is left for other selection logic -- confirm against the callers.
991 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode
*N
) {
// Operand 0 is the chain, operand 1 the address being loaded from.
993 SDValue Chain
= N
->getOperand(0);
994 SDValue Op1
= N
->getOperand(1);
995 SDValue Addr
, Offset
, Base
;
996 Optional
<unsigned> Opcode
;
999 MemSDNode
*MemSD
= cast
<MemSDNode
>(N
);
1000 EVT LoadedVT
= MemSD
->getMemoryVT();
// getSimpleVT() is called on the memory type further down, so a non-simple
// memory type is checked for first.
1002 if (!LoadedVT
.isSimple())
1005 // Address Space Setting
1006 unsigned int CodeAddrSpace
= getCodeAddrSpace(MemSD
);
// Loads that canLowerToLDG() accepts are handed over to tryLDGLDU().
1007 if (canLowerToLDG(MemSD
, *Subtarget
, CodeAddrSpace
, MF
)) {
1008 return tryLDGLDU(N
);
// Pointer width in bits for the accessed address space; used below to choose
// between the 32-bit and the *_64 address matchers and opcodes.
1011 unsigned int PointerSize
=
1012 CurDAG
->getDataLayout().getPointerSizeInBits(MemSD
->getAddressSpace());
1015 // - .volatile is only available for .global and .shared
1016 bool IsVolatile
= MemSD
->isVolatile();
1017 if (CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GLOBAL
&&
1018 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::SHARED
&&
1019 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GENERIC
)
1023 MVT SimpleVT
= LoadedVT
.getSimpleVT();
1025 // Type Setting: fromType + fromTypeWidth
1027 // Sign : ISD::SEXTLOAD
1028 // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
1030 // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
1031 MVT ScalarVT
= SimpleVT
.getScalarType();
1032 // Read at least 8 bits (predicates are stored as 8-bit values)
1033 unsigned FromTypeWidth
= std::max(8U, (unsigned)ScalarVT
.getSizeInBits());
1034 unsigned int FromType
;
1035 // The last operand holds the original LoadSDNode::getExtensionType() value
1036 unsigned ExtensionType
= cast
<ConstantSDNode
>(
1037 N
->getOperand(N
->getNumOperands() - 1))->getZExtValue();
// Sign-extending loads are tagged Signed; floating-point scalars are tagged
// Untyped for f16 and Float otherwise; everything else is tagged Unsigned.
1038 if (ExtensionType
== ISD::SEXTLOAD
)
1039 FromType
= NVPTX::PTXLdStInstCode::Signed
;
1040 else if (ScalarVT
.isFloatingPoint())
1041 FromType
= ScalarVT
.SimpleTy
== MVT::f16
? NVPTX::PTXLdStInstCode::Untyped
1042 : NVPTX::PTXLdStInstCode::Float
;
1044 FromType
= NVPTX::PTXLdStInstCode::Unsigned
;
// Record the vector arity implied by the node opcode.
1048 switch (N
->getOpcode()) {
1049 case NVPTXISD::LoadV2
:
1050 VecType
= NVPTX::PTXLdStInstCode::V2
;
1052 case NVPTXISD::LoadV4
:
1053 VecType
= NVPTX::PTXLdStInstCode::V4
;
1059 EVT EltVT
= N
->getValueType(0);
1061 // v8f16 is a special case. PTX doesn't have ld.v8.f16
1062 // instruction. Instead, we split the vector into v2f16 chunks and
1063 // load them with ld.v4.b32.
1064 if (EltVT
== MVT::v2f16
) {
1065 assert(N
->getOpcode() == NVPTXISD::LoadV4
&& "Unexpected load opcode.");
1067 FromType
= NVPTX::PTXLdStInstCode::Untyped
;
// Pick the opcode by addressing mode, trying in order: SelectDirectAddr
// (*_avar), SelectADDRsi / SelectADDRsi64 (*_asi), SelectADDRri /
// SelectADDRri64 (*_ari, *_ari_64), and otherwise the register form (*_areg,
// *_areg_64), which passes Op1 itself as the address operand. The V4 tables
// pass None in the i64 and f64 slots.
1071 if (SelectDirectAddr(Op1
, Addr
)) {
1072 switch (N
->getOpcode()) {
1075 case NVPTXISD::LoadV2
:
1076 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1077 NVPTX::LDV_i8_v2_avar
, NVPTX::LDV_i16_v2_avar
,
1078 NVPTX::LDV_i32_v2_avar
, NVPTX::LDV_i64_v2_avar
,
1079 NVPTX::LDV_f16_v2_avar
, NVPTX::LDV_f16x2_v2_avar
,
1080 NVPTX::LDV_f32_v2_avar
, NVPTX::LDV_f64_v2_avar
);
1082 case NVPTXISD::LoadV4
:
1084 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_avar
,
1085 NVPTX::LDV_i16_v4_avar
, NVPTX::LDV_i32_v4_avar
, None
,
1086 NVPTX::LDV_f16_v4_avar
, NVPTX::LDV_f16x2_v4_avar
,
1087 NVPTX::LDV_f32_v4_avar
, None
);
1092 SDValue Ops
[] = { getI32Imm(IsVolatile
, DL
), getI32Imm(CodeAddrSpace
, DL
),
1093 getI32Imm(VecType
, DL
), getI32Imm(FromType
, DL
),
1094 getI32Imm(FromTypeWidth
, DL
), Addr
, Chain
};
1095 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, N
->getVTList(), Ops
);
1096 } else if (PointerSize
== 64
1097 ? SelectADDRsi64(Op1
.getNode(), Op1
, Base
, Offset
)
1098 : SelectADDRsi(Op1
.getNode(), Op1
, Base
, Offset
)) {
1099 switch (N
->getOpcode()) {
1102 case NVPTXISD::LoadV2
:
1103 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1104 NVPTX::LDV_i8_v2_asi
, NVPTX::LDV_i16_v2_asi
,
1105 NVPTX::LDV_i32_v2_asi
, NVPTX::LDV_i64_v2_asi
,
1106 NVPTX::LDV_f16_v2_asi
, NVPTX::LDV_f16x2_v2_asi
,
1107 NVPTX::LDV_f32_v2_asi
, NVPTX::LDV_f64_v2_asi
);
1109 case NVPTXISD::LoadV4
:
1111 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_asi
,
1112 NVPTX::LDV_i16_v4_asi
, NVPTX::LDV_i32_v4_asi
, None
,
1113 NVPTX::LDV_f16_v4_asi
, NVPTX::LDV_f16x2_v4_asi
,
1114 NVPTX::LDV_f32_v4_asi
, None
);
1119 SDValue Ops
[] = { getI32Imm(IsVolatile
, DL
), getI32Imm(CodeAddrSpace
, DL
),
1120 getI32Imm(VecType
, DL
), getI32Imm(FromType
, DL
),
1121 getI32Imm(FromTypeWidth
, DL
), Base
, Offset
, Chain
};
1122 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, N
->getVTList(), Ops
);
1123 } else if (PointerSize
== 64
1124 ? SelectADDRri64(Op1
.getNode(), Op1
, Base
, Offset
)
1125 : SelectADDRri(Op1
.getNode(), Op1
, Base
, Offset
)) {
1126 if (PointerSize
== 64) {
1127 switch (N
->getOpcode()) {
1130 case NVPTXISD::LoadV2
:
1131 Opcode
= pickOpcodeForVT(
1132 EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v2_ari_64
,
1133 NVPTX::LDV_i16_v2_ari_64
, NVPTX::LDV_i32_v2_ari_64
,
1134 NVPTX::LDV_i64_v2_ari_64
, NVPTX::LDV_f16_v2_ari_64
,
1135 NVPTX::LDV_f16x2_v2_ari_64
, NVPTX::LDV_f32_v2_ari_64
,
1136 NVPTX::LDV_f64_v2_ari_64
);
1138 case NVPTXISD::LoadV4
:
1139 Opcode
= pickOpcodeForVT(
1140 EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_ari_64
,
1141 NVPTX::LDV_i16_v4_ari_64
, NVPTX::LDV_i32_v4_ari_64
, None
,
1142 NVPTX::LDV_f16_v4_ari_64
, NVPTX::LDV_f16x2_v4_ari_64
,
1143 NVPTX::LDV_f32_v4_ari_64
, None
);
1147 switch (N
->getOpcode()) {
1150 case NVPTXISD::LoadV2
:
1151 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1152 NVPTX::LDV_i8_v2_ari
, NVPTX::LDV_i16_v2_ari
,
1153 NVPTX::LDV_i32_v2_ari
, NVPTX::LDV_i64_v2_ari
,
1154 NVPTX::LDV_f16_v2_ari
, NVPTX::LDV_f16x2_v2_ari
,
1155 NVPTX::LDV_f32_v2_ari
, NVPTX::LDV_f64_v2_ari
);
1157 case NVPTXISD::LoadV4
:
1159 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_ari
,
1160 NVPTX::LDV_i16_v4_ari
, NVPTX::LDV_i32_v4_ari
, None
,
1161 NVPTX::LDV_f16_v4_ari
, NVPTX::LDV_f16x2_v4_ari
,
1162 NVPTX::LDV_f32_v4_ari
, None
);
1168 SDValue Ops
[] = { getI32Imm(IsVolatile
, DL
), getI32Imm(CodeAddrSpace
, DL
),
1169 getI32Imm(VecType
, DL
), getI32Imm(FromType
, DL
),
1170 getI32Imm(FromTypeWidth
, DL
), Base
, Offset
, Chain
};
1172 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, N
->getVTList(), Ops
);
1174 if (PointerSize
== 64) {
1175 switch (N
->getOpcode()) {
1178 case NVPTXISD::LoadV2
:
1179 Opcode
= pickOpcodeForVT(
1180 EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v2_areg_64
,
1181 NVPTX::LDV_i16_v2_areg_64
, NVPTX::LDV_i32_v2_areg_64
,
1182 NVPTX::LDV_i64_v2_areg_64
, NVPTX::LDV_f16_v2_areg_64
,
1183 NVPTX::LDV_f16x2_v2_areg_64
, NVPTX::LDV_f32_v2_areg_64
,
1184 NVPTX::LDV_f64_v2_areg_64
);
1186 case NVPTXISD::LoadV4
:
1187 Opcode
= pickOpcodeForVT(
1188 EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_areg_64
,
1189 NVPTX::LDV_i16_v4_areg_64
, NVPTX::LDV_i32_v4_areg_64
, None
,
1190 NVPTX::LDV_f16_v4_areg_64
, NVPTX::LDV_f16x2_v4_areg_64
,
1191 NVPTX::LDV_f32_v4_areg_64
, None
);
1195 switch (N
->getOpcode()) {
1198 case NVPTXISD::LoadV2
:
1200 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v2_areg
,
1201 NVPTX::LDV_i16_v2_areg
, NVPTX::LDV_i32_v2_areg
,
1202 NVPTX::LDV_i64_v2_areg
, NVPTX::LDV_f16_v2_areg
,
1203 NVPTX::LDV_f16x2_v2_areg
, NVPTX::LDV_f32_v2_areg
,
1204 NVPTX::LDV_f64_v2_areg
);
1206 case NVPTXISD::LoadV4
:
1207 Opcode
= pickOpcodeForVT(
1208 EltVT
.getSimpleVT().SimpleTy
, NVPTX::LDV_i8_v4_areg
,
1209 NVPTX::LDV_i16_v4_areg
, NVPTX::LDV_i32_v4_areg
, None
,
1210 NVPTX::LDV_f16_v4_areg
, NVPTX::LDV_f16x2_v4_areg
,
1211 NVPTX::LDV_f32_v4_areg
, None
);
1217 SDValue Ops
[] = { getI32Imm(IsVolatile
, DL
), getI32Imm(CodeAddrSpace
, DL
),
1218 getI32Imm(VecType
, DL
), getI32Imm(FromType
, DL
),
1219 getI32Imm(FromTypeWidth
, DL
), Op1
, Chain
};
1220 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, N
->getVTList(), Ops
);
// Carry the original node's memory operand over to the new machine node.
1223 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
1224 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(LD
), {MemRef
});
// Selects an INT_PTX_LDG_* / INT_PTX_LDU_* machine instruction for N.
//
// \param N  Either an ISD::INTRINSIC_W_CHAIN node (intrinsic ID in operand 1,
//           address in operand 2) or a load-like memory node whose address is
//           operand 1 (the opcode switches below handle LoadV2/LDGV2, LDUV2,
//           LoadV4/LDGV4 and LDUV4).
//
// The opcode is picked from the element type of the memory type, the node
// opcode and the addressing mode. The new node returns one value per element
// plus a chain. When N is an extending LoadSDNode, a CVT is added for each
// result so that users see the original (wider) type.
// NOTE(review): presumably returns true when N was selected -- confirm
// against the callers.
1230 bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode
*N
) {
1232 SDValue Chain
= N
->getOperand(0);
1237 // If this is an LDG intrinsic, the address is the third operand. If it's an
1238 // LDG/LDU SD node (from custom vector handling), then it's the second operand
1239 if (N
->getOpcode() == ISD::INTRINSIC_W_CHAIN
) {
1240 Op1
= N
->getOperand(2);
1241 Mem
= cast
<MemIntrinsicSDNode
>(N
);
1242 unsigned IID
= cast
<ConstantSDNode
>(N
->getOperand(1))->getZExtValue();
1246 case Intrinsic::nvvm_ldg_global_f
:
1247 case Intrinsic::nvvm_ldg_global_i
:
1248 case Intrinsic::nvvm_ldg_global_p
:
1251 case Intrinsic::nvvm_ldu_global_f
:
1252 case Intrinsic::nvvm_ldu_global_i
:
1253 case Intrinsic::nvvm_ldu_global_p
:
1258 Op1
= N
->getOperand(1);
1259 Mem
= cast
<MemSDNode
>(N
);
1262 Optional
<unsigned> Opcode
;
1265 SDValue Base
, Offset
, Addr
;
// Work with the element type and element count of the memory type.
1267 EVT EltVT
= Mem
->getMemoryVT();
1268 unsigned NumElts
= 1;
1269 if (EltVT
.isVector()) {
1270 NumElts
= EltVT
.getVectorNumElements();
1271 EltVT
= EltVT
.getVectorElementType();
1272 // vectors of f16 are loaded/stored as multiples of v2f16 elements.
1273 if (EltVT
== MVT::f16
&& N
->getValueType(0) == MVT::v2f16
) {
1274 assert(NumElts
% 2 == 0 && "Vector must have even number of elements");
1280 // Build the "promoted" result VTList for the load. If we are really loading
1281 // i8s, then the return type will be promoted to i16 since we do not expose
1282 // 8-bit registers in NVPTX.
1283 EVT NodeVT
= (EltVT
== MVT::i8
) ? MVT::i16
: EltVT
;
1284 SmallVector
<EVT
, 5> InstVTs
;
1285 for (unsigned i
= 0; i
!= NumElts
; ++i
) {
1286 InstVTs
.push_back(NodeVT
);
// The trailing MVT::Other entry is the chain result.
1288 InstVTs
.push_back(MVT::Other
);
1289 SDVTList InstVTList
= CurDAG
->getVTList(InstVTs
);
// Pick the opcode by addressing mode, trying in order: SelectDirectAddr
// (*avar), SelectADDRri64 / SelectADDRri chosen by TM.is64Bit() (*ari64,
// *ari / *ari32), and otherwise the register form (*areg64, *areg / *areg32)
// with Op1 as the address operand. LoadV2 shares the LDGV2 opcodes and LoadV4
// shares the LDGV4 opcodes; the V4 tables pass None in the i64 and f64 slots.
1291 if (SelectDirectAddr(Op1
, Addr
)) {
1292 switch (N
->getOpcode()) {
1296 case ISD::INTRINSIC_W_CHAIN
:
1298 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1299 NVPTX::INT_PTX_LDG_GLOBAL_i8avar
,
1300 NVPTX::INT_PTX_LDG_GLOBAL_i16avar
,
1301 NVPTX::INT_PTX_LDG_GLOBAL_i32avar
,
1302 NVPTX::INT_PTX_LDG_GLOBAL_i64avar
,
1303 NVPTX::INT_PTX_LDG_GLOBAL_f16avar
,
1304 NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar
,
1305 NVPTX::INT_PTX_LDG_GLOBAL_f32avar
,
1306 NVPTX::INT_PTX_LDG_GLOBAL_f64avar
);
1308 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1309 NVPTX::INT_PTX_LDU_GLOBAL_i8avar
,
1310 NVPTX::INT_PTX_LDU_GLOBAL_i16avar
,
1311 NVPTX::INT_PTX_LDU_GLOBAL_i32avar
,
1312 NVPTX::INT_PTX_LDU_GLOBAL_i64avar
,
1313 NVPTX::INT_PTX_LDU_GLOBAL_f16avar
,
1314 NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar
,
1315 NVPTX::INT_PTX_LDU_GLOBAL_f32avar
,
1316 NVPTX::INT_PTX_LDU_GLOBAL_f64avar
);
1318 case NVPTXISD::LoadV2
:
1319 case NVPTXISD::LDGV2
:
1320 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1321 NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar
,
1322 NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar
,
1323 NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar
,
1324 NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar
,
1325 NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar
,
1326 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar
,
1327 NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar
,
1328 NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar
);
1330 case NVPTXISD::LDUV2
:
1331 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1332 NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar
,
1333 NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar
,
1334 NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar
,
1335 NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar
,
1336 NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar
,
1337 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar
,
1338 NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar
,
1339 NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar
);
1341 case NVPTXISD::LoadV4
:
1342 case NVPTXISD::LDGV4
:
1343 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1344 NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar
,
1345 NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar
,
1346 NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar
, None
,
1347 NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar
,
1348 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar
,
1349 NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar
, None
);
1351 case NVPTXISD::LDUV4
:
1352 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1353 NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar
,
1354 NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar
,
1355 NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar
, None
,
1356 NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar
,
1357 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar
,
1358 NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar
, None
);
1363 SDValue Ops
[] = { Addr
, Chain
};
1364 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, InstVTList
, Ops
);
1365 } else if (TM
.is64Bit() ? SelectADDRri64(Op1
.getNode(), Op1
, Base
, Offset
)
1366 : SelectADDRri(Op1
.getNode(), Op1
, Base
, Offset
)) {
1368 switch (N
->getOpcode()) {
1372 case ISD::INTRINSIC_W_CHAIN
:
1374 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1375 NVPTX::INT_PTX_LDG_GLOBAL_i8ari64
,
1376 NVPTX::INT_PTX_LDG_GLOBAL_i16ari64
,
1377 NVPTX::INT_PTX_LDG_GLOBAL_i32ari64
,
1378 NVPTX::INT_PTX_LDG_GLOBAL_i64ari64
,
1379 NVPTX::INT_PTX_LDG_GLOBAL_f16ari64
,
1380 NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64
,
1381 NVPTX::INT_PTX_LDG_GLOBAL_f32ari64
,
1382 NVPTX::INT_PTX_LDG_GLOBAL_f64ari64
);
1384 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1385 NVPTX::INT_PTX_LDU_GLOBAL_i8ari64
,
1386 NVPTX::INT_PTX_LDU_GLOBAL_i16ari64
,
1387 NVPTX::INT_PTX_LDU_GLOBAL_i32ari64
,
1388 NVPTX::INT_PTX_LDU_GLOBAL_i64ari64
,
1389 NVPTX::INT_PTX_LDU_GLOBAL_f16ari64
,
1390 NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64
,
1391 NVPTX::INT_PTX_LDU_GLOBAL_f32ari64
,
1392 NVPTX::INT_PTX_LDU_GLOBAL_f64ari64
);
1394 case NVPTXISD::LoadV2
:
1395 case NVPTXISD::LDGV2
:
1396 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1397 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64
,
1398 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64
,
1399 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64
,
1400 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64
,
1401 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64
,
1402 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64
,
1403 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64
,
1404 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64
);
1406 case NVPTXISD::LDUV2
:
1407 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1408 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64
,
1409 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64
,
1410 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64
,
1411 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64
,
1412 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64
,
1413 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64
,
1414 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64
,
1415 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64
);
1417 case NVPTXISD::LoadV4
:
1418 case NVPTXISD::LDGV4
:
1419 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1420 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64
,
1421 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64
,
1422 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64
, None
,
1423 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64
,
1424 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64
,
1425 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64
, None
);
1427 case NVPTXISD::LDUV4
:
1428 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1429 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64
,
1430 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64
,
1431 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64
, None
,
1432 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64
,
1433 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64
,
1434 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64
, None
);
1438 switch (N
->getOpcode()) {
1442 case ISD::INTRINSIC_W_CHAIN
:
1444 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1445 NVPTX::INT_PTX_LDG_GLOBAL_i8ari
,
1446 NVPTX::INT_PTX_LDG_GLOBAL_i16ari
,
1447 NVPTX::INT_PTX_LDG_GLOBAL_i32ari
,
1448 NVPTX::INT_PTX_LDG_GLOBAL_i64ari
,
1449 NVPTX::INT_PTX_LDG_GLOBAL_f16ari
,
1450 NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari
,
1451 NVPTX::INT_PTX_LDG_GLOBAL_f32ari
,
1452 NVPTX::INT_PTX_LDG_GLOBAL_f64ari
);
1454 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1455 NVPTX::INT_PTX_LDU_GLOBAL_i8ari
,
1456 NVPTX::INT_PTX_LDU_GLOBAL_i16ari
,
1457 NVPTX::INT_PTX_LDU_GLOBAL_i32ari
,
1458 NVPTX::INT_PTX_LDU_GLOBAL_i64ari
,
1459 NVPTX::INT_PTX_LDU_GLOBAL_f16ari
,
1460 NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari
,
1461 NVPTX::INT_PTX_LDU_GLOBAL_f32ari
,
1462 NVPTX::INT_PTX_LDU_GLOBAL_f64ari
);
1464 case NVPTXISD::LoadV2
:
1465 case NVPTXISD::LDGV2
:
1466 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1467 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32
,
1468 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32
,
1469 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32
,
1470 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32
,
1471 NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32
,
1472 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32
,
1473 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32
,
1474 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32
);
1476 case NVPTXISD::LDUV2
:
1477 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1478 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32
,
1479 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32
,
1480 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32
,
1481 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32
,
1482 NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32
,
1483 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32
,
1484 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32
,
1485 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32
);
1487 case NVPTXISD::LoadV4
:
1488 case NVPTXISD::LDGV4
:
1489 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1490 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32
,
1491 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32
,
1492 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32
, None
,
1493 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32
,
1494 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32
,
1495 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32
, None
);
1497 case NVPTXISD::LDUV4
:
1498 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1499 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32
,
1500 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32
,
1501 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32
, None
,
1502 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32
,
1503 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32
,
1504 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32
, None
);
1510 SDValue Ops
[] = {Base
, Offset
, Chain
};
1511 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, InstVTList
, Ops
);
1514 switch (N
->getOpcode()) {
1518 case ISD::INTRINSIC_W_CHAIN
:
1520 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1521 NVPTX::INT_PTX_LDG_GLOBAL_i8areg64
,
1522 NVPTX::INT_PTX_LDG_GLOBAL_i16areg64
,
1523 NVPTX::INT_PTX_LDG_GLOBAL_i32areg64
,
1524 NVPTX::INT_PTX_LDG_GLOBAL_i64areg64
,
1525 NVPTX::INT_PTX_LDG_GLOBAL_f16areg64
,
1526 NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64
,
1527 NVPTX::INT_PTX_LDG_GLOBAL_f32areg64
,
1528 NVPTX::INT_PTX_LDG_GLOBAL_f64areg64
);
1530 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1531 NVPTX::INT_PTX_LDU_GLOBAL_i8areg64
,
1532 NVPTX::INT_PTX_LDU_GLOBAL_i16areg64
,
1533 NVPTX::INT_PTX_LDU_GLOBAL_i32areg64
,
1534 NVPTX::INT_PTX_LDU_GLOBAL_i64areg64
,
1535 NVPTX::INT_PTX_LDU_GLOBAL_f16areg64
,
1536 NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64
,
1537 NVPTX::INT_PTX_LDU_GLOBAL_f32areg64
,
1538 NVPTX::INT_PTX_LDU_GLOBAL_f64areg64
);
1540 case NVPTXISD::LoadV2
:
1541 case NVPTXISD::LDGV2
:
1542 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1543 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64
,
1544 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64
,
1545 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64
,
1546 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64
,
1547 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64
,
1548 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64
,
1549 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64
,
1550 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64
);
1552 case NVPTXISD::LDUV2
:
1553 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1554 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64
,
1555 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64
,
1556 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64
,
1557 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64
,
1558 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64
,
1559 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64
,
1560 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64
,
1561 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64
);
1563 case NVPTXISD::LoadV4
:
1564 case NVPTXISD::LDGV4
:
1565 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1566 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64
,
1567 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64
,
1568 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64
, None
,
1569 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64
,
1570 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64
,
1571 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64
, None
);
1573 case NVPTXISD::LDUV4
:
1574 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1575 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64
,
1576 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64
,
1577 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64
, None
,
1578 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64
,
1579 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64
,
1580 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64
, None
);
1584 switch (N
->getOpcode()) {
1588 case ISD::INTRINSIC_W_CHAIN
:
1590 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1591 NVPTX::INT_PTX_LDG_GLOBAL_i8areg
,
1592 NVPTX::INT_PTX_LDG_GLOBAL_i16areg
,
1593 NVPTX::INT_PTX_LDG_GLOBAL_i32areg
,
1594 NVPTX::INT_PTX_LDG_GLOBAL_i64areg
,
1595 NVPTX::INT_PTX_LDG_GLOBAL_f16areg
,
1596 NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg
,
1597 NVPTX::INT_PTX_LDG_GLOBAL_f32areg
,
1598 NVPTX::INT_PTX_LDG_GLOBAL_f64areg
);
1600 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1601 NVPTX::INT_PTX_LDU_GLOBAL_i8areg
,
1602 NVPTX::INT_PTX_LDU_GLOBAL_i16areg
,
1603 NVPTX::INT_PTX_LDU_GLOBAL_i32areg
,
1604 NVPTX::INT_PTX_LDU_GLOBAL_i64areg
,
1605 NVPTX::INT_PTX_LDU_GLOBAL_f16areg
,
1606 NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg
,
1607 NVPTX::INT_PTX_LDU_GLOBAL_f32areg
,
1608 NVPTX::INT_PTX_LDU_GLOBAL_f64areg
);
1610 case NVPTXISD::LoadV2
:
1611 case NVPTXISD::LDGV2
:
1612 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1613 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32
,
1614 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32
,
1615 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32
,
1616 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32
,
1617 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32
,
1618 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32
,
1619 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32
,
1620 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32
);
1622 case NVPTXISD::LDUV2
:
1623 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1624 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32
,
1625 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32
,
1626 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32
,
1627 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32
,
1628 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32
,
1629 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32
,
1630 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32
,
1631 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32
);
1633 case NVPTXISD::LoadV4
:
1634 case NVPTXISD::LDGV4
:
1635 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1636 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32
,
1637 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32
,
1638 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32
, None
,
1639 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32
,
1640 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32
,
1641 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32
, None
);
1643 case NVPTXISD::LDUV4
:
1644 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1645 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32
,
1646 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32
,
1647 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32
, None
,
1648 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32
,
1649 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32
,
1650 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32
, None
);
1656 SDValue Ops
[] = { Op1
, Chain
};
1657 LD
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, InstVTList
, Ops
);
// Carry the memory operand of the original node over to the new machine node.
1660 MachineMemOperand
*MemRef
= Mem
->getMemOperand();
1661 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(LD
), {MemRef
});
1663 // For automatic generation of LDG (through SelectLoad[Vector], not the
1664 // intrinsics), we may have an extending load like:
1666 // i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
1668 // In this case, the matching logic above will select a load for the original
1669 // memory type (in this case, i8) and our types will not match (the node needs
1670 // to return an i32 in this case). Our LDG/LDU nodes do not support the
1671 // concept of sign-/zero-extension, so emulate it here by adding an explicit
1672 // CVT instruction. Ptxas should clean up any redundancies here.
1674 EVT OrigType
= N
->getValueType(0);
1675 LoadSDNode
*LdNode
= dyn_cast
<LoadSDNode
>(N
);
1677 if (OrigType
!= EltVT
&& LdNode
) {
1678 // We have an extending-load. The instruction we selected operates on the
1679 // smaller type, but the SDNode we are replacing has the larger type. We
1680 // need to emit a CVT to make the types match.
1681 bool IsSigned
= LdNode
->getExtensionType() == ISD::SEXTLOAD
;
1682 unsigned CvtOpc
= GetConvertOpcode(OrigType
.getSimpleVT(),
1683 EltVT
.getSimpleVT(), IsSigned
);
1685 // For each output value, apply the manual sign/zero-extension and make sure
1686 // all users of the load go through that CVT.
1687 for (unsigned i
= 0; i
!= NumElts
; ++i
) {
1689 SDValue
OrigVal(N
, i
);
1692 CurDAG
->getMachineNode(CvtOpc
, DL
, OrigType
, Res
,
1693 CurDAG
->getTargetConstant(NVPTX::PTXCvtMode::NONE
,
1695 ReplaceUses(OrigVal
, SDValue(CvtNode
, 0));
// Selects an ST_* machine instruction for a scalar store.
//
// \param N  A memory node that writes memory; it must be either a StoreSDNode
//           or an AtomicSDNode (both facts are asserted below).
//
// The opcode is picked from the type of the stored value and the addressing
// mode of the base pointer. The new machine node carries N's memory operand
// and replaces N.
// NOTE(review): presumably returns true when N was selected and false when
// the store is not handled here (indexed, non-simple type, ordering stronger
// than monotonic) -- confirm against the callers.
1703 bool NVPTXDAGToDAGISel::tryStore(SDNode
*N
) {
1705 MemSDNode
*ST
= cast
<MemSDNode
>(N
);
1706 assert(ST
->writeMem() && "Expected store");
// Exactly one of these is expected to be non-null; it determines how the
// stored value is fetched further down.
1707 StoreSDNode
*PlainStore
= dyn_cast
<StoreSDNode
>(N
);
1708 AtomicSDNode
*AtomicStore
= dyn_cast
<AtomicSDNode
>(N
);
1709 assert((PlainStore
|| AtomicStore
) && "Expected store");
1710 EVT StoreVT
= ST
->getMemoryVT();
1711 SDNode
*NVPTXST
= nullptr;
1713 // do not support pre/post inc/dec
1714 if (PlainStore
&& PlainStore
->isIndexed())
1717 if (!StoreVT
.isSimple())
1720 AtomicOrdering Ordering
= ST
->getSuccessOrdering();
1721 // In order to lower atomic stores with stronger guarantees we would need to
1722 // use store.release or insert fences. However these features were only added
1723 // with PTX ISA 6.0 / sm_70.
1724 // TODO: Check if we can actually use the new instructions and implement them.
1725 if (isStrongerThanMonotonic(Ordering
))
1728 // Address Space Setting
1729 unsigned int CodeAddrSpace
= getCodeAddrSpace(ST
);
// Pointer width in bits for the accessed address space; used below to choose
// between the 32-bit and the *_64 address matchers and opcodes.
1730 unsigned int PointerSize
=
1731 CurDAG
->getDataLayout().getPointerSizeInBits(ST
->getAddressSpace());
1734 // - .volatile is only available for .global and .shared
1735 // - .volatile has the same memory synchronization semantics as .relaxed.sys
1736 bool isVolatile
= ST
->isVolatile() || Ordering
== AtomicOrdering::Monotonic
;
1737 if (CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GLOBAL
&&
1738 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::SHARED
&&
1739 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GENERIC
)
1743 MVT SimpleVT
= StoreVT
.getSimpleVT();
1744 unsigned vecType
= NVPTX::PTXLdStInstCode::Scalar
;
1746 // Type Setting: toType + toTypeWidth
1747 // - for integer type, always use 'u'
1749 MVT ScalarVT
= SimpleVT
.getScalarType();
1750 unsigned toTypeWidth
= ScalarVT
.getSizeInBits();
1751 if (SimpleVT
.isVector()) {
1752 assert(StoreVT
== MVT::v2f16
&& "Unexpected vector type");
1753 // v2f16 is stored using st.b32
1757 unsigned int toType
;
1758 if (ScalarVT
.isFloatingPoint())
1759 // f16 uses .b16 as its storage type.
1760 toType
= ScalarVT
.SimpleTy
== MVT::f16
? NVPTX::PTXLdStInstCode::Untyped
1761 : NVPTX::PTXLdStInstCode::Float
;
1763 toType
= NVPTX::PTXLdStInstCode::Unsigned
;
1765 // Create the machine instruction DAG
1766 SDValue Chain
= ST
->getChain();
1767 SDValue Value
= PlainStore
? PlainStore
->getValue() : AtomicStore
->getVal();
1768 SDValue BasePtr
= ST
->getBasePtr();
1770 SDValue Offset
, Base
;
1771 Optional
<unsigned> Opcode
;
// The opcode is keyed on the type of the value being stored, read from result
// 0 of the node that defines it.
1772 MVT::SimpleValueType SourceVT
=
1773 Value
.getNode()->getSimpleValueType(0).SimpleTy
;
// Pick the opcode by addressing mode, trying in order: SelectDirectAddr
// (*_avar), SelectADDRsi / SelectADDRsi64 (*_asi), SelectADDRri /
// SelectADDRri64 (*_ari, *_ari_64), and otherwise the register form (*_areg,
// *_areg_64).
1775 if (SelectDirectAddr(BasePtr
, Addr
)) {
1776 Opcode
= pickOpcodeForVT(SourceVT
, NVPTX::ST_i8_avar
, NVPTX::ST_i16_avar
,
1777 NVPTX::ST_i32_avar
, NVPTX::ST_i64_avar
,
1778 NVPTX::ST_f16_avar
, NVPTX::ST_f16x2_avar
,
1779 NVPTX::ST_f32_avar
, NVPTX::ST_f64_avar
);
1782 SDValue Ops
[] = {Value
,
1783 getI32Imm(isVolatile
, dl
),
1784 getI32Imm(CodeAddrSpace
, dl
),
1785 getI32Imm(vecType
, dl
),
1786 getI32Imm(toType
, dl
),
1787 getI32Imm(toTypeWidth
, dl
),
1790 NVPTXST
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, MVT::Other
, Ops
);
1791 } else if (PointerSize
== 64
1792 ? SelectADDRsi64(BasePtr
.getNode(), BasePtr
, Base
, Offset
)
1793 : SelectADDRsi(BasePtr
.getNode(), BasePtr
, Base
, Offset
)) {
1794 Opcode
= pickOpcodeForVT(SourceVT
, NVPTX::ST_i8_asi
, NVPTX::ST_i16_asi
,
1795 NVPTX::ST_i32_asi
, NVPTX::ST_i64_asi
,
1796 NVPTX::ST_f16_asi
, NVPTX::ST_f16x2_asi
,
1797 NVPTX::ST_f32_asi
, NVPTX::ST_f64_asi
);
1800 SDValue Ops
[] = {Value
,
1801 getI32Imm(isVolatile
, dl
),
1802 getI32Imm(CodeAddrSpace
, dl
),
1803 getI32Imm(vecType
, dl
),
1804 getI32Imm(toType
, dl
),
1805 getI32Imm(toTypeWidth
, dl
),
1809 NVPTXST
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, MVT::Other
, Ops
);
1810 } else if (PointerSize
== 64
1811 ? SelectADDRri64(BasePtr
.getNode(), BasePtr
, Base
, Offset
)
1812 : SelectADDRri(BasePtr
.getNode(), BasePtr
, Base
, Offset
)) {
1813 if (PointerSize
== 64)
1814 Opcode
= pickOpcodeForVT(
1815 SourceVT
, NVPTX::ST_i8_ari_64
, NVPTX::ST_i16_ari_64
,
1816 NVPTX::ST_i32_ari_64
, NVPTX::ST_i64_ari_64
, NVPTX::ST_f16_ari_64
,
1817 NVPTX::ST_f16x2_ari_64
, NVPTX::ST_f32_ari_64
, NVPTX::ST_f64_ari_64
);
1819 Opcode
= pickOpcodeForVT(SourceVT
, NVPTX::ST_i8_ari
, NVPTX::ST_i16_ari
,
1820 NVPTX::ST_i32_ari
, NVPTX::ST_i64_ari
,
1821 NVPTX::ST_f16_ari
, NVPTX::ST_f16x2_ari
,
1822 NVPTX::ST_f32_ari
, NVPTX::ST_f64_ari
);
1826 SDValue Ops
[] = {Value
,
1827 getI32Imm(isVolatile
, dl
),
1828 getI32Imm(CodeAddrSpace
, dl
),
1829 getI32Imm(vecType
, dl
),
1830 getI32Imm(toType
, dl
),
1831 getI32Imm(toTypeWidth
, dl
),
1835 NVPTXST
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, MVT::Other
, Ops
);
1837 if (PointerSize
== 64)
1839 pickOpcodeForVT(SourceVT
, NVPTX::ST_i8_areg_64
, NVPTX::ST_i16_areg_64
,
1840 NVPTX::ST_i32_areg_64
, NVPTX::ST_i64_areg_64
,
1841 NVPTX::ST_f16_areg_64
, NVPTX::ST_f16x2_areg_64
,
1842 NVPTX::ST_f32_areg_64
, NVPTX::ST_f64_areg_64
);
1844 Opcode
= pickOpcodeForVT(SourceVT
, NVPTX::ST_i8_areg
, NVPTX::ST_i16_areg
,
1845 NVPTX::ST_i32_areg
, NVPTX::ST_i64_areg
,
1846 NVPTX::ST_f16_areg
, NVPTX::ST_f16x2_areg
,
1847 NVPTX::ST_f32_areg
, NVPTX::ST_f64_areg
);
1850 SDValue Ops
[] = {Value
,
1851 getI32Imm(isVolatile
, dl
),
1852 getI32Imm(CodeAddrSpace
, dl
),
1853 getI32Imm(vecType
, dl
),
1854 getI32Imm(toType
, dl
),
1855 getI32Imm(toTypeWidth
, dl
),
1858 NVPTXST
= CurDAG
->getMachineNode(Opcode
.getValue(), dl
, MVT::Other
, Ops
);
// Carry the memory operand over to the new machine node, then replace N.
1864 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
1865 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(NVPTXST
), {MemRef
});
1866 ReplaceNode(N
, NVPTXST
);
1870 bool NVPTXDAGToDAGISel::tryStoreVector(SDNode
*N
) {
1871 SDValue Chain
= N
->getOperand(0);
1872 SDValue Op1
= N
->getOperand(1);
1873 SDValue Addr
, Offset
, Base
;
1874 Optional
<unsigned> Opcode
;
1877 EVT EltVT
= Op1
.getValueType();
1878 MemSDNode
*MemSD
= cast
<MemSDNode
>(N
);
1879 EVT StoreVT
= MemSD
->getMemoryVT();
1881 // Address Space Setting
1882 unsigned CodeAddrSpace
= getCodeAddrSpace(MemSD
);
1883 if (CodeAddrSpace
== NVPTX::PTXLdStInstCode::CONSTANT
) {
1884 report_fatal_error("Cannot store to pointer that points to constant "
1887 unsigned int PointerSize
=
1888 CurDAG
->getDataLayout().getPointerSizeInBits(MemSD
->getAddressSpace());
1891 // - .volatile is only available for .global and .shared
1892 bool IsVolatile
= MemSD
->isVolatile();
1893 if (CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GLOBAL
&&
1894 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::SHARED
&&
1895 CodeAddrSpace
!= NVPTX::PTXLdStInstCode::GENERIC
)
1898 // Type Setting: toType + toTypeWidth
1899 // - for integer type, always use 'u'
1900 assert(StoreVT
.isSimple() && "Store value is not simple");
1901 MVT ScalarVT
= StoreVT
.getSimpleVT().getScalarType();
1902 unsigned ToTypeWidth
= ScalarVT
.getSizeInBits();
1904 if (ScalarVT
.isFloatingPoint())
1905 ToType
= ScalarVT
.SimpleTy
== MVT::f16
? NVPTX::PTXLdStInstCode::Untyped
1906 : NVPTX::PTXLdStInstCode::Float
;
1908 ToType
= NVPTX::PTXLdStInstCode::Unsigned
;
1910 SmallVector
<SDValue
, 12> StOps
;
1914 switch (N
->getOpcode()) {
1915 case NVPTXISD::StoreV2
:
1916 VecType
= NVPTX::PTXLdStInstCode::V2
;
1917 StOps
.push_back(N
->getOperand(1));
1918 StOps
.push_back(N
->getOperand(2));
1919 N2
= N
->getOperand(3);
1921 case NVPTXISD::StoreV4
:
1922 VecType
= NVPTX::PTXLdStInstCode::V4
;
1923 StOps
.push_back(N
->getOperand(1));
1924 StOps
.push_back(N
->getOperand(2));
1925 StOps
.push_back(N
->getOperand(3));
1926 StOps
.push_back(N
->getOperand(4));
1927 N2
= N
->getOperand(5);
1933 // v8f16 is a special case. PTX doesn't have st.v8.f16
1934 // instruction. Instead, we split the vector into v2f16 chunks and
1935 // store them with st.v4.b32.
1936 if (EltVT
== MVT::v2f16
) {
1937 assert(N
->getOpcode() == NVPTXISD::StoreV4
&& "Unexpected load opcode.");
1939 ToType
= NVPTX::PTXLdStInstCode::Untyped
;
1943 StOps
.push_back(getI32Imm(IsVolatile
, DL
));
1944 StOps
.push_back(getI32Imm(CodeAddrSpace
, DL
));
1945 StOps
.push_back(getI32Imm(VecType
, DL
));
1946 StOps
.push_back(getI32Imm(ToType
, DL
));
1947 StOps
.push_back(getI32Imm(ToTypeWidth
, DL
));
1949 if (SelectDirectAddr(N2
, Addr
)) {
1950 switch (N
->getOpcode()) {
1953 case NVPTXISD::StoreV2
:
1954 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1955 NVPTX::STV_i8_v2_avar
, NVPTX::STV_i16_v2_avar
,
1956 NVPTX::STV_i32_v2_avar
, NVPTX::STV_i64_v2_avar
,
1957 NVPTX::STV_f16_v2_avar
, NVPTX::STV_f16x2_v2_avar
,
1958 NVPTX::STV_f32_v2_avar
, NVPTX::STV_f64_v2_avar
);
1960 case NVPTXISD::StoreV4
:
1962 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_avar
,
1963 NVPTX::STV_i16_v4_avar
, NVPTX::STV_i32_v4_avar
, None
,
1964 NVPTX::STV_f16_v4_avar
, NVPTX::STV_f16x2_v4_avar
,
1965 NVPTX::STV_f32_v4_avar
, None
);
1968 StOps
.push_back(Addr
);
1969 } else if (PointerSize
== 64 ? SelectADDRsi64(N2
.getNode(), N2
, Base
, Offset
)
1970 : SelectADDRsi(N2
.getNode(), N2
, Base
, Offset
)) {
1971 switch (N
->getOpcode()) {
1974 case NVPTXISD::StoreV2
:
1975 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
1976 NVPTX::STV_i8_v2_asi
, NVPTX::STV_i16_v2_asi
,
1977 NVPTX::STV_i32_v2_asi
, NVPTX::STV_i64_v2_asi
,
1978 NVPTX::STV_f16_v2_asi
, NVPTX::STV_f16x2_v2_asi
,
1979 NVPTX::STV_f32_v2_asi
, NVPTX::STV_f64_v2_asi
);
1981 case NVPTXISD::StoreV4
:
1983 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_asi
,
1984 NVPTX::STV_i16_v4_asi
, NVPTX::STV_i32_v4_asi
, None
,
1985 NVPTX::STV_f16_v4_asi
, NVPTX::STV_f16x2_v4_asi
,
1986 NVPTX::STV_f32_v4_asi
, None
);
1989 StOps
.push_back(Base
);
1990 StOps
.push_back(Offset
);
1991 } else if (PointerSize
== 64 ? SelectADDRri64(N2
.getNode(), N2
, Base
, Offset
)
1992 : SelectADDRri(N2
.getNode(), N2
, Base
, Offset
)) {
1993 if (PointerSize
== 64) {
1994 switch (N
->getOpcode()) {
1997 case NVPTXISD::StoreV2
:
1998 Opcode
= pickOpcodeForVT(
1999 EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v2_ari_64
,
2000 NVPTX::STV_i16_v2_ari_64
, NVPTX::STV_i32_v2_ari_64
,
2001 NVPTX::STV_i64_v2_ari_64
, NVPTX::STV_f16_v2_ari_64
,
2002 NVPTX::STV_f16x2_v2_ari_64
, NVPTX::STV_f32_v2_ari_64
,
2003 NVPTX::STV_f64_v2_ari_64
);
2005 case NVPTXISD::StoreV4
:
2006 Opcode
= pickOpcodeForVT(
2007 EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_ari_64
,
2008 NVPTX::STV_i16_v4_ari_64
, NVPTX::STV_i32_v4_ari_64
, None
,
2009 NVPTX::STV_f16_v4_ari_64
, NVPTX::STV_f16x2_v4_ari_64
,
2010 NVPTX::STV_f32_v4_ari_64
, None
);
2014 switch (N
->getOpcode()) {
2017 case NVPTXISD::StoreV2
:
2018 Opcode
= pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
,
2019 NVPTX::STV_i8_v2_ari
, NVPTX::STV_i16_v2_ari
,
2020 NVPTX::STV_i32_v2_ari
, NVPTX::STV_i64_v2_ari
,
2021 NVPTX::STV_f16_v2_ari
, NVPTX::STV_f16x2_v2_ari
,
2022 NVPTX::STV_f32_v2_ari
, NVPTX::STV_f64_v2_ari
);
2024 case NVPTXISD::StoreV4
:
2026 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_ari
,
2027 NVPTX::STV_i16_v4_ari
, NVPTX::STV_i32_v4_ari
, None
,
2028 NVPTX::STV_f16_v4_ari
, NVPTX::STV_f16x2_v4_ari
,
2029 NVPTX::STV_f32_v4_ari
, None
);
2033 StOps
.push_back(Base
);
2034 StOps
.push_back(Offset
);
2036 if (PointerSize
== 64) {
2037 switch (N
->getOpcode()) {
2040 case NVPTXISD::StoreV2
:
2041 Opcode
= pickOpcodeForVT(
2042 EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v2_areg_64
,
2043 NVPTX::STV_i16_v2_areg_64
, NVPTX::STV_i32_v2_areg_64
,
2044 NVPTX::STV_i64_v2_areg_64
, NVPTX::STV_f16_v2_areg_64
,
2045 NVPTX::STV_f16x2_v2_areg_64
, NVPTX::STV_f32_v2_areg_64
,
2046 NVPTX::STV_f64_v2_areg_64
);
2048 case NVPTXISD::StoreV4
:
2049 Opcode
= pickOpcodeForVT(
2050 EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_areg_64
,
2051 NVPTX::STV_i16_v4_areg_64
, NVPTX::STV_i32_v4_areg_64
, None
,
2052 NVPTX::STV_f16_v4_areg_64
, NVPTX::STV_f16x2_v4_areg_64
,
2053 NVPTX::STV_f32_v4_areg_64
, None
);
2057 switch (N
->getOpcode()) {
2060 case NVPTXISD::StoreV2
:
2062 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v2_areg
,
2063 NVPTX::STV_i16_v2_areg
, NVPTX::STV_i32_v2_areg
,
2064 NVPTX::STV_i64_v2_areg
, NVPTX::STV_f16_v2_areg
,
2065 NVPTX::STV_f16x2_v2_areg
, NVPTX::STV_f32_v2_areg
,
2066 NVPTX::STV_f64_v2_areg
);
2068 case NVPTXISD::StoreV4
:
2070 pickOpcodeForVT(EltVT
.getSimpleVT().SimpleTy
, NVPTX::STV_i8_v4_areg
,
2071 NVPTX::STV_i16_v4_areg
, NVPTX::STV_i32_v4_areg
, None
,
2072 NVPTX::STV_f16_v4_areg
, NVPTX::STV_f16x2_v4_areg
,
2073 NVPTX::STV_f32_v4_areg
, None
);
2077 StOps
.push_back(N2
);
2083 StOps
.push_back(Chain
);
2085 ST
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, MVT::Other
, StOps
);
2087 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
2088 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(ST
), {MemRef
});
2094 bool NVPTXDAGToDAGISel::tryLoadParam(SDNode
*Node
) {
2095 SDValue Chain
= Node
->getOperand(0);
2096 SDValue Offset
= Node
->getOperand(2);
2097 SDValue Flag
= Node
->getOperand(3);
2099 MemSDNode
*Mem
= cast
<MemSDNode
>(Node
);
2102 switch (Node
->getOpcode()) {
2105 case NVPTXISD::LoadParam
:
2108 case NVPTXISD::LoadParamV2
:
2111 case NVPTXISD::LoadParamV4
:
2116 EVT EltVT
= Node
->getValueType(0);
2117 EVT MemVT
= Mem
->getMemoryVT();
2119 Optional
<unsigned> Opcode
;
2125 Opcode
= pickOpcodeForVT(MemVT
.getSimpleVT().SimpleTy
,
2126 NVPTX::LoadParamMemI8
, NVPTX::LoadParamMemI16
,
2127 NVPTX::LoadParamMemI32
, NVPTX::LoadParamMemI64
,
2128 NVPTX::LoadParamMemF16
, NVPTX::LoadParamMemF16x2
,
2129 NVPTX::LoadParamMemF32
, NVPTX::LoadParamMemF64
);
2133 pickOpcodeForVT(MemVT
.getSimpleVT().SimpleTy
, NVPTX::LoadParamMemV2I8
,
2134 NVPTX::LoadParamMemV2I16
, NVPTX::LoadParamMemV2I32
,
2135 NVPTX::LoadParamMemV2I64
, NVPTX::LoadParamMemV2F16
,
2136 NVPTX::LoadParamMemV2F16x2
, NVPTX::LoadParamMemV2F32
,
2137 NVPTX::LoadParamMemV2F64
);
2140 Opcode
= pickOpcodeForVT(
2141 MemVT
.getSimpleVT().SimpleTy
, NVPTX::LoadParamMemV4I8
,
2142 NVPTX::LoadParamMemV4I16
, NVPTX::LoadParamMemV4I32
, None
,
2143 NVPTX::LoadParamMemV4F16
, NVPTX::LoadParamMemV4F16x2
,
2144 NVPTX::LoadParamMemV4F32
, None
);
2152 VTs
= CurDAG
->getVTList(EltVT
, MVT::Other
, MVT::Glue
);
2153 } else if (VecSize
== 2) {
2154 VTs
= CurDAG
->getVTList(EltVT
, EltVT
, MVT::Other
, MVT::Glue
);
2156 EVT EVTs
[] = { EltVT
, EltVT
, EltVT
, EltVT
, MVT::Other
, MVT::Glue
};
2157 VTs
= CurDAG
->getVTList(EVTs
);
2160 unsigned OffsetVal
= cast
<ConstantSDNode
>(Offset
)->getZExtValue();
2162 SmallVector
<SDValue
, 2> Ops
;
2163 Ops
.push_back(CurDAG
->getTargetConstant(OffsetVal
, DL
, MVT::i32
));
2164 Ops
.push_back(Chain
);
2165 Ops
.push_back(Flag
);
2167 ReplaceNode(Node
, CurDAG
->getMachineNode(Opcode
.getValue(), DL
, VTs
, Ops
));
2171 bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode
*N
) {
2173 SDValue Chain
= N
->getOperand(0);
2174 SDValue Offset
= N
->getOperand(1);
2175 unsigned OffsetVal
= cast
<ConstantSDNode
>(Offset
)->getZExtValue();
2176 MemSDNode
*Mem
= cast
<MemSDNode
>(N
);
2178 // How many elements do we have?
2179 unsigned NumElts
= 1;
2180 switch (N
->getOpcode()) {
2183 case NVPTXISD::StoreRetval
:
2186 case NVPTXISD::StoreRetvalV2
:
2189 case NVPTXISD::StoreRetvalV4
:
2194 // Build vector of operands
2195 SmallVector
<SDValue
, 6> Ops
;
2196 for (unsigned i
= 0; i
< NumElts
; ++i
)
2197 Ops
.push_back(N
->getOperand(i
+ 2));
2198 Ops
.push_back(CurDAG
->getTargetConstant(OffsetVal
, DL
, MVT::i32
));
2199 Ops
.push_back(Chain
);
2201 // Determine target opcode
2202 // If we have an i1, use an 8-bit store. The lowering code in
2203 // NVPTXISelLowering will have already emitted an upcast.
2204 Optional
<unsigned> Opcode
= 0;
2209 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2210 NVPTX::StoreRetvalI8
, NVPTX::StoreRetvalI16
,
2211 NVPTX::StoreRetvalI32
, NVPTX::StoreRetvalI64
,
2212 NVPTX::StoreRetvalF16
, NVPTX::StoreRetvalF16x2
,
2213 NVPTX::StoreRetvalF32
, NVPTX::StoreRetvalF64
);
2216 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2217 NVPTX::StoreRetvalV2I8
, NVPTX::StoreRetvalV2I16
,
2218 NVPTX::StoreRetvalV2I32
, NVPTX::StoreRetvalV2I64
,
2219 NVPTX::StoreRetvalV2F16
, NVPTX::StoreRetvalV2F16x2
,
2220 NVPTX::StoreRetvalV2F32
, NVPTX::StoreRetvalV2F64
);
2223 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2224 NVPTX::StoreRetvalV4I8
, NVPTX::StoreRetvalV4I16
,
2225 NVPTX::StoreRetvalV4I32
, None
,
2226 NVPTX::StoreRetvalV4F16
, NVPTX::StoreRetvalV4F16x2
,
2227 NVPTX::StoreRetvalV4F32
, None
);
2233 SDNode
*Ret
= CurDAG
->getMachineNode(Opcode
.getValue(), DL
, MVT::Other
, Ops
);
2234 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
2235 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(Ret
), {MemRef
});
2237 ReplaceNode(N
, Ret
);
2241 bool NVPTXDAGToDAGISel::tryStoreParam(SDNode
*N
) {
2243 SDValue Chain
= N
->getOperand(0);
2244 SDValue Param
= N
->getOperand(1);
2245 unsigned ParamVal
= cast
<ConstantSDNode
>(Param
)->getZExtValue();
2246 SDValue Offset
= N
->getOperand(2);
2247 unsigned OffsetVal
= cast
<ConstantSDNode
>(Offset
)->getZExtValue();
2248 MemSDNode
*Mem
= cast
<MemSDNode
>(N
);
2249 SDValue Flag
= N
->getOperand(N
->getNumOperands() - 1);
2251 // How many elements do we have?
2252 unsigned NumElts
= 1;
2253 switch (N
->getOpcode()) {
2256 case NVPTXISD::StoreParamU32
:
2257 case NVPTXISD::StoreParamS32
:
2258 case NVPTXISD::StoreParam
:
2261 case NVPTXISD::StoreParamV2
:
2264 case NVPTXISD::StoreParamV4
:
2269 // Build vector of operands
2270 SmallVector
<SDValue
, 8> Ops
;
2271 for (unsigned i
= 0; i
< NumElts
; ++i
)
2272 Ops
.push_back(N
->getOperand(i
+ 3));
2273 Ops
.push_back(CurDAG
->getTargetConstant(ParamVal
, DL
, MVT::i32
));
2274 Ops
.push_back(CurDAG
->getTargetConstant(OffsetVal
, DL
, MVT::i32
));
2275 Ops
.push_back(Chain
);
2276 Ops
.push_back(Flag
);
2278 // Determine target opcode
2279 // If we have an i1, use an 8-bit store. The lowering code in
2280 // NVPTXISelLowering will have already emitted an upcast.
2281 Optional
<unsigned> Opcode
= 0;
2282 switch (N
->getOpcode()) {
2288 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2289 NVPTX::StoreParamI8
, NVPTX::StoreParamI16
,
2290 NVPTX::StoreParamI32
, NVPTX::StoreParamI64
,
2291 NVPTX::StoreParamF16
, NVPTX::StoreParamF16x2
,
2292 NVPTX::StoreParamF32
, NVPTX::StoreParamF64
);
2295 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2296 NVPTX::StoreParamV2I8
, NVPTX::StoreParamV2I16
,
2297 NVPTX::StoreParamV2I32
, NVPTX::StoreParamV2I64
,
2298 NVPTX::StoreParamV2F16
, NVPTX::StoreParamV2F16x2
,
2299 NVPTX::StoreParamV2F32
, NVPTX::StoreParamV2F64
);
2302 Opcode
= pickOpcodeForVT(Mem
->getMemoryVT().getSimpleVT().SimpleTy
,
2303 NVPTX::StoreParamV4I8
, NVPTX::StoreParamV4I16
,
2304 NVPTX::StoreParamV4I32
, None
,
2305 NVPTX::StoreParamV4F16
, NVPTX::StoreParamV4F16x2
,
2306 NVPTX::StoreParamV4F32
, None
);
2312 // Special case: if we have a sign-extend/zero-extend node, insert the
2313 // conversion instruction first, and use that as the value operand to
2314 // the selected StoreParam node.
2315 case NVPTXISD::StoreParamU32
: {
2316 Opcode
= NVPTX::StoreParamI32
;
2317 SDValue CvtNone
= CurDAG
->getTargetConstant(NVPTX::PTXCvtMode::NONE
, DL
,
2319 SDNode
*Cvt
= CurDAG
->getMachineNode(NVPTX::CVT_u32_u16
, DL
,
2320 MVT::i32
, Ops
[0], CvtNone
);
2321 Ops
[0] = SDValue(Cvt
, 0);
2324 case NVPTXISD::StoreParamS32
: {
2325 Opcode
= NVPTX::StoreParamI32
;
2326 SDValue CvtNone
= CurDAG
->getTargetConstant(NVPTX::PTXCvtMode::NONE
, DL
,
2328 SDNode
*Cvt
= CurDAG
->getMachineNode(NVPTX::CVT_s32_s16
, DL
,
2329 MVT::i32
, Ops
[0], CvtNone
);
2330 Ops
[0] = SDValue(Cvt
, 0);
2335 SDVTList RetVTs
= CurDAG
->getVTList(MVT::Other
, MVT::Glue
);
2337 CurDAG
->getMachineNode(Opcode
.getValue(), DL
, RetVTs
, Ops
);
2338 MachineMemOperand
*MemRef
= cast
<MemSDNode
>(N
)->getMemOperand();
2339 CurDAG
->setNodeMemRefs(cast
<MachineSDNode
>(Ret
), {MemRef
});
2341 ReplaceNode(N
, Ret
);
2345 bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode
*N
) {
2348 switch (N
->getOpcode()) {
2349 default: return false;
2350 case NVPTXISD::Tex1DFloatS32
:
2351 Opc
= NVPTX::TEX_1D_F32_S32
;
2353 case NVPTXISD::Tex1DFloatFloat
:
2354 Opc
= NVPTX::TEX_1D_F32_F32
;
2356 case NVPTXISD::Tex1DFloatFloatLevel
:
2357 Opc
= NVPTX::TEX_1D_F32_F32_LEVEL
;
2359 case NVPTXISD::Tex1DFloatFloatGrad
:
2360 Opc
= NVPTX::TEX_1D_F32_F32_GRAD
;
2362 case NVPTXISD::Tex1DS32S32
:
2363 Opc
= NVPTX::TEX_1D_S32_S32
;
2365 case NVPTXISD::Tex1DS32Float
:
2366 Opc
= NVPTX::TEX_1D_S32_F32
;
2368 case NVPTXISD::Tex1DS32FloatLevel
:
2369 Opc
= NVPTX::TEX_1D_S32_F32_LEVEL
;
2371 case NVPTXISD::Tex1DS32FloatGrad
:
2372 Opc
= NVPTX::TEX_1D_S32_F32_GRAD
;
2374 case NVPTXISD::Tex1DU32S32
:
2375 Opc
= NVPTX::TEX_1D_U32_S32
;
2377 case NVPTXISD::Tex1DU32Float
:
2378 Opc
= NVPTX::TEX_1D_U32_F32
;
2380 case NVPTXISD::Tex1DU32FloatLevel
:
2381 Opc
= NVPTX::TEX_1D_U32_F32_LEVEL
;
2383 case NVPTXISD::Tex1DU32FloatGrad
:
2384 Opc
= NVPTX::TEX_1D_U32_F32_GRAD
;
2386 case NVPTXISD::Tex1DArrayFloatS32
:
2387 Opc
= NVPTX::TEX_1D_ARRAY_F32_S32
;
2389 case NVPTXISD::Tex1DArrayFloatFloat
:
2390 Opc
= NVPTX::TEX_1D_ARRAY_F32_F32
;
2392 case NVPTXISD::Tex1DArrayFloatFloatLevel
:
2393 Opc
= NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL
;
2395 case NVPTXISD::Tex1DArrayFloatFloatGrad
:
2396 Opc
= NVPTX::TEX_1D_ARRAY_F32_F32_GRAD
;
2398 case NVPTXISD::Tex1DArrayS32S32
:
2399 Opc
= NVPTX::TEX_1D_ARRAY_S32_S32
;
2401 case NVPTXISD::Tex1DArrayS32Float
:
2402 Opc
= NVPTX::TEX_1D_ARRAY_S32_F32
;
2404 case NVPTXISD::Tex1DArrayS32FloatLevel
:
2405 Opc
= NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL
;
2407 case NVPTXISD::Tex1DArrayS32FloatGrad
:
2408 Opc
= NVPTX::TEX_1D_ARRAY_S32_F32_GRAD
;
2410 case NVPTXISD::Tex1DArrayU32S32
:
2411 Opc
= NVPTX::TEX_1D_ARRAY_U32_S32
;
2413 case NVPTXISD::Tex1DArrayU32Float
:
2414 Opc
= NVPTX::TEX_1D_ARRAY_U32_F32
;
2416 case NVPTXISD::Tex1DArrayU32FloatLevel
:
2417 Opc
= NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL
;
2419 case NVPTXISD::Tex1DArrayU32FloatGrad
:
2420 Opc
= NVPTX::TEX_1D_ARRAY_U32_F32_GRAD
;
2422 case NVPTXISD::Tex2DFloatS32
:
2423 Opc
= NVPTX::TEX_2D_F32_S32
;
2425 case NVPTXISD::Tex2DFloatFloat
:
2426 Opc
= NVPTX::TEX_2D_F32_F32
;
2428 case NVPTXISD::Tex2DFloatFloatLevel
:
2429 Opc
= NVPTX::TEX_2D_F32_F32_LEVEL
;
2431 case NVPTXISD::Tex2DFloatFloatGrad
:
2432 Opc
= NVPTX::TEX_2D_F32_F32_GRAD
;
2434 case NVPTXISD::Tex2DS32S32
:
2435 Opc
= NVPTX::TEX_2D_S32_S32
;
2437 case NVPTXISD::Tex2DS32Float
:
2438 Opc
= NVPTX::TEX_2D_S32_F32
;
2440 case NVPTXISD::Tex2DS32FloatLevel
:
2441 Opc
= NVPTX::TEX_2D_S32_F32_LEVEL
;
2443 case NVPTXISD::Tex2DS32FloatGrad
:
2444 Opc
= NVPTX::TEX_2D_S32_F32_GRAD
;
2446 case NVPTXISD::Tex2DU32S32
:
2447 Opc
= NVPTX::TEX_2D_U32_S32
;
2449 case NVPTXISD::Tex2DU32Float
:
2450 Opc
= NVPTX::TEX_2D_U32_F32
;
2452 case NVPTXISD::Tex2DU32FloatLevel
:
2453 Opc
= NVPTX::TEX_2D_U32_F32_LEVEL
;
2455 case NVPTXISD::Tex2DU32FloatGrad
:
2456 Opc
= NVPTX::TEX_2D_U32_F32_GRAD
;
2458 case NVPTXISD::Tex2DArrayFloatS32
:
2459 Opc
= NVPTX::TEX_2D_ARRAY_F32_S32
;
2461 case NVPTXISD::Tex2DArrayFloatFloat
:
2462 Opc
= NVPTX::TEX_2D_ARRAY_F32_F32
;
2464 case NVPTXISD::Tex2DArrayFloatFloatLevel
:
2465 Opc
= NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL
;
2467 case NVPTXISD::Tex2DArrayFloatFloatGrad
:
2468 Opc
= NVPTX::TEX_2D_ARRAY_F32_F32_GRAD
;
2470 case NVPTXISD::Tex2DArrayS32S32
:
2471 Opc
= NVPTX::TEX_2D_ARRAY_S32_S32
;
2473 case NVPTXISD::Tex2DArrayS32Float
:
2474 Opc
= NVPTX::TEX_2D_ARRAY_S32_F32
;
2476 case NVPTXISD::Tex2DArrayS32FloatLevel
:
2477 Opc
= NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL
;
2479 case NVPTXISD::Tex2DArrayS32FloatGrad
:
2480 Opc
= NVPTX::TEX_2D_ARRAY_S32_F32_GRAD
;
2482 case NVPTXISD::Tex2DArrayU32S32
:
2483 Opc
= NVPTX::TEX_2D_ARRAY_U32_S32
;
2485 case NVPTXISD::Tex2DArrayU32Float
:
2486 Opc
= NVPTX::TEX_2D_ARRAY_U32_F32
;
2488 case NVPTXISD::Tex2DArrayU32FloatLevel
:
2489 Opc
= NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL
;
2491 case NVPTXISD::Tex2DArrayU32FloatGrad
:
2492 Opc
= NVPTX::TEX_2D_ARRAY_U32_F32_GRAD
;
2494 case NVPTXISD::Tex3DFloatS32
:
2495 Opc
= NVPTX::TEX_3D_F32_S32
;
2497 case NVPTXISD::Tex3DFloatFloat
:
2498 Opc
= NVPTX::TEX_3D_F32_F32
;
2500 case NVPTXISD::Tex3DFloatFloatLevel
:
2501 Opc
= NVPTX::TEX_3D_F32_F32_LEVEL
;
2503 case NVPTXISD::Tex3DFloatFloatGrad
:
2504 Opc
= NVPTX::TEX_3D_F32_F32_GRAD
;
2506 case NVPTXISD::Tex3DS32S32
:
2507 Opc
= NVPTX::TEX_3D_S32_S32
;
2509 case NVPTXISD::Tex3DS32Float
:
2510 Opc
= NVPTX::TEX_3D_S32_F32
;
2512 case NVPTXISD::Tex3DS32FloatLevel
:
2513 Opc
= NVPTX::TEX_3D_S32_F32_LEVEL
;
2515 case NVPTXISD::Tex3DS32FloatGrad
:
2516 Opc
= NVPTX::TEX_3D_S32_F32_GRAD
;
2518 case NVPTXISD::Tex3DU32S32
:
2519 Opc
= NVPTX::TEX_3D_U32_S32
;
2521 case NVPTXISD::Tex3DU32Float
:
2522 Opc
= NVPTX::TEX_3D_U32_F32
;
2524 case NVPTXISD::Tex3DU32FloatLevel
:
2525 Opc
= NVPTX::TEX_3D_U32_F32_LEVEL
;
2527 case NVPTXISD::Tex3DU32FloatGrad
:
2528 Opc
= NVPTX::TEX_3D_U32_F32_GRAD
;
2530 case NVPTXISD::TexCubeFloatFloat
:
2531 Opc
= NVPTX::TEX_CUBE_F32_F32
;
2533 case NVPTXISD::TexCubeFloatFloatLevel
:
2534 Opc
= NVPTX::TEX_CUBE_F32_F32_LEVEL
;
2536 case NVPTXISD::TexCubeS32Float
:
2537 Opc
= NVPTX::TEX_CUBE_S32_F32
;
2539 case NVPTXISD::TexCubeS32FloatLevel
:
2540 Opc
= NVPTX::TEX_CUBE_S32_F32_LEVEL
;
2542 case NVPTXISD::TexCubeU32Float
:
2543 Opc
= NVPTX::TEX_CUBE_U32_F32
;
2545 case NVPTXISD::TexCubeU32FloatLevel
:
2546 Opc
= NVPTX::TEX_CUBE_U32_F32_LEVEL
;
2548 case NVPTXISD::TexCubeArrayFloatFloat
:
2549 Opc
= NVPTX::TEX_CUBE_ARRAY_F32_F32
;
2551 case NVPTXISD::TexCubeArrayFloatFloatLevel
:
2552 Opc
= NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL
;
2554 case NVPTXISD::TexCubeArrayS32Float
:
2555 Opc
= NVPTX::TEX_CUBE_ARRAY_S32_F32
;
2557 case NVPTXISD::TexCubeArrayS32FloatLevel
:
2558 Opc
= NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL
;
2560 case NVPTXISD::TexCubeArrayU32Float
:
2561 Opc
= NVPTX::TEX_CUBE_ARRAY_U32_F32
;
2563 case NVPTXISD::TexCubeArrayU32FloatLevel
:
2564 Opc
= NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL
;
2566 case NVPTXISD::Tld4R2DFloatFloat
:
2567 Opc
= NVPTX::TLD4_R_2D_F32_F32
;
2569 case NVPTXISD::Tld4G2DFloatFloat
:
2570 Opc
= NVPTX::TLD4_G_2D_F32_F32
;
2572 case NVPTXISD::Tld4B2DFloatFloat
:
2573 Opc
= NVPTX::TLD4_B_2D_F32_F32
;
2575 case NVPTXISD::Tld4A2DFloatFloat
:
2576 Opc
= NVPTX::TLD4_A_2D_F32_F32
;
2578 case NVPTXISD::Tld4R2DS64Float
:
2579 Opc
= NVPTX::TLD4_R_2D_S32_F32
;
2581 case NVPTXISD::Tld4G2DS64Float
:
2582 Opc
= NVPTX::TLD4_G_2D_S32_F32
;
2584 case NVPTXISD::Tld4B2DS64Float
:
2585 Opc
= NVPTX::TLD4_B_2D_S32_F32
;
2587 case NVPTXISD::Tld4A2DS64Float
:
2588 Opc
= NVPTX::TLD4_A_2D_S32_F32
;
2590 case NVPTXISD::Tld4R2DU64Float
:
2591 Opc
= NVPTX::TLD4_R_2D_U32_F32
;
2593 case NVPTXISD::Tld4G2DU64Float
:
2594 Opc
= NVPTX::TLD4_G_2D_U32_F32
;
2596 case NVPTXISD::Tld4B2DU64Float
:
2597 Opc
= NVPTX::TLD4_B_2D_U32_F32
;
2599 case NVPTXISD::Tld4A2DU64Float
:
2600 Opc
= NVPTX::TLD4_A_2D_U32_F32
;
2602 case NVPTXISD::TexUnified1DFloatS32
:
2603 Opc
= NVPTX::TEX_UNIFIED_1D_F32_S32
;
2605 case NVPTXISD::TexUnified1DFloatFloat
:
2606 Opc
= NVPTX::TEX_UNIFIED_1D_F32_F32
;
2608 case NVPTXISD::TexUnified1DFloatFloatLevel
:
2609 Opc
= NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL
;
2611 case NVPTXISD::TexUnified1DFloatFloatGrad
:
2612 Opc
= NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD
;
2614 case NVPTXISD::TexUnified1DS32S32
:
2615 Opc
= NVPTX::TEX_UNIFIED_1D_S32_S32
;
2617 case NVPTXISD::TexUnified1DS32Float
:
2618 Opc
= NVPTX::TEX_UNIFIED_1D_S32_F32
;
2620 case NVPTXISD::TexUnified1DS32FloatLevel
:
2621 Opc
= NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL
;
2623 case NVPTXISD::TexUnified1DS32FloatGrad
:
2624 Opc
= NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD
;
2626 case NVPTXISD::TexUnified1DU32S32
:
2627 Opc
= NVPTX::TEX_UNIFIED_1D_U32_S32
;
2629 case NVPTXISD::TexUnified1DU32Float
:
2630 Opc
= NVPTX::TEX_UNIFIED_1D_U32_F32
;
2632 case NVPTXISD::TexUnified1DU32FloatLevel
:
2633 Opc
= NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL
;
2635 case NVPTXISD::TexUnified1DU32FloatGrad
:
2636 Opc
= NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD
;
2638 case NVPTXISD::TexUnified1DArrayFloatS32
:
2639 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32
;
2641 case NVPTXISD::TexUnified1DArrayFloatFloat
:
2642 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32
;
2644 case NVPTXISD::TexUnified1DArrayFloatFloatLevel
:
2645 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL
;
2647 case NVPTXISD::TexUnified1DArrayFloatFloatGrad
:
2648 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD
;
2650 case NVPTXISD::TexUnified1DArrayS32S32
:
2651 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32
;
2653 case NVPTXISD::TexUnified1DArrayS32Float
:
2654 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32
;
2656 case NVPTXISD::TexUnified1DArrayS32FloatLevel
:
2657 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL
;
2659 case NVPTXISD::TexUnified1DArrayS32FloatGrad
:
2660 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD
;
2662 case NVPTXISD::TexUnified1DArrayU32S32
:
2663 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32
;
2665 case NVPTXISD::TexUnified1DArrayU32Float
:
2666 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32
;
2668 case NVPTXISD::TexUnified1DArrayU32FloatLevel
:
2669 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL
;
2671 case NVPTXISD::TexUnified1DArrayU32FloatGrad
:
2672 Opc
= NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD
;
2674 case NVPTXISD::TexUnified2DFloatS32
:
2675 Opc
= NVPTX::TEX_UNIFIED_2D_F32_S32
;
2677 case NVPTXISD::TexUnified2DFloatFloat
:
2678 Opc
= NVPTX::TEX_UNIFIED_2D_F32_F32
;
2680 case NVPTXISD::TexUnified2DFloatFloatLevel
:
2681 Opc
= NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL
;
2683 case NVPTXISD::TexUnified2DFloatFloatGrad
:
2684 Opc
= NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD
;
2686 case NVPTXISD::TexUnified2DS32S32
:
2687 Opc
= NVPTX::TEX_UNIFIED_2D_S32_S32
;
2689 case NVPTXISD::TexUnified2DS32Float
:
2690 Opc
= NVPTX::TEX_UNIFIED_2D_S32_F32
;
2692 case NVPTXISD::TexUnified2DS32FloatLevel
:
2693 Opc
= NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL
;
2695 case NVPTXISD::TexUnified2DS32FloatGrad
:
2696 Opc
= NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD
;
2698 case NVPTXISD::TexUnified2DU32S32
:
2699 Opc
= NVPTX::TEX_UNIFIED_2D_U32_S32
;
2701 case NVPTXISD::TexUnified2DU32Float
:
2702 Opc
= NVPTX::TEX_UNIFIED_2D_U32_F32
;
2704 case NVPTXISD::TexUnified2DU32FloatLevel
:
2705 Opc
= NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL
;
2707 case NVPTXISD::TexUnified2DU32FloatGrad
:
2708 Opc
= NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD
;
2710 case NVPTXISD::TexUnified2DArrayFloatS32
:
2711 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32
;
2713 case NVPTXISD::TexUnified2DArrayFloatFloat
:
2714 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32
;
2716 case NVPTXISD::TexUnified2DArrayFloatFloatLevel
:
2717 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL
;
2719 case NVPTXISD::TexUnified2DArrayFloatFloatGrad
:
2720 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD
;
2722 case NVPTXISD::TexUnified2DArrayS32S32
:
2723 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32
;
2725 case NVPTXISD::TexUnified2DArrayS32Float
:
2726 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32
;
2728 case NVPTXISD::TexUnified2DArrayS32FloatLevel
:
2729 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL
;
2731 case NVPTXISD::TexUnified2DArrayS32FloatGrad
:
2732 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD
;
2734 case NVPTXISD::TexUnified2DArrayU32S32
:
2735 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32
;
2737 case NVPTXISD::TexUnified2DArrayU32Float
:
2738 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32
;
2740 case NVPTXISD::TexUnified2DArrayU32FloatLevel
:
2741 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL
;
2743 case NVPTXISD::TexUnified2DArrayU32FloatGrad
:
2744 Opc
= NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD
;
2746 case NVPTXISD::TexUnified3DFloatS32
:
2747 Opc
= NVPTX::TEX_UNIFIED_3D_F32_S32
;
2749 case NVPTXISD::TexUnified3DFloatFloat
:
2750 Opc
= NVPTX::TEX_UNIFIED_3D_F32_F32
;
2752 case NVPTXISD::TexUnified3DFloatFloatLevel
:
2753 Opc
= NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL
;
2755 case NVPTXISD::TexUnified3DFloatFloatGrad
:
2756 Opc
= NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD
;
2758 case NVPTXISD::TexUnified3DS32S32
:
2759 Opc
= NVPTX::TEX_UNIFIED_3D_S32_S32
;
2761 case NVPTXISD::TexUnified3DS32Float
:
2762 Opc
= NVPTX::TEX_UNIFIED_3D_S32_F32
;
2764 case NVPTXISD::TexUnified3DS32FloatLevel
:
2765 Opc
= NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL
;
2767 case NVPTXISD::TexUnified3DS32FloatGrad
:
2768 Opc
= NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD
;
2770 case NVPTXISD::TexUnified3DU32S32
:
2771 Opc
= NVPTX::TEX_UNIFIED_3D_U32_S32
;
2773 case NVPTXISD::TexUnified3DU32Float
:
2774 Opc
= NVPTX::TEX_UNIFIED_3D_U32_F32
;
2776 case NVPTXISD::TexUnified3DU32FloatLevel
:
2777 Opc
= NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL
;
2779 case NVPTXISD::TexUnified3DU32FloatGrad
:
2780 Opc
= NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD
;
2782 case NVPTXISD::TexUnifiedCubeFloatFloat
:
2783 Opc
= NVPTX::TEX_UNIFIED_CUBE_F32_F32
;
2785 case NVPTXISD::TexUnifiedCubeFloatFloatLevel
:
2786 Opc
= NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL
;
2788 case NVPTXISD::TexUnifiedCubeS32Float
:
2789 Opc
= NVPTX::TEX_UNIFIED_CUBE_S32_F32
;
2791 case NVPTXISD::TexUnifiedCubeS32FloatLevel
:
2792 Opc
= NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL
;
2794 case NVPTXISD::TexUnifiedCubeU32Float
:
2795 Opc
= NVPTX::TEX_UNIFIED_CUBE_U32_F32
;
2797 case NVPTXISD::TexUnifiedCubeU32FloatLevel
:
2798 Opc
= NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL
;
2800 case NVPTXISD::TexUnifiedCubeArrayFloatFloat
:
2801 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32
;
2803 case NVPTXISD::TexUnifiedCubeArrayFloatFloatLevel
:
2804 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL
;
2806 case NVPTXISD::TexUnifiedCubeArrayS32Float
:
2807 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32
;
2809 case NVPTXISD::TexUnifiedCubeArrayS32FloatLevel
:
2810 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL
;
2812 case NVPTXISD::TexUnifiedCubeArrayU32Float
:
2813 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32
;
2815 case NVPTXISD::TexUnifiedCubeArrayU32FloatLevel
:
2816 Opc
= NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL
;
2818 case NVPTXISD::Tld4UnifiedR2DFloatFloat
:
2819 Opc
= NVPTX::TLD4_UNIFIED_R_2D_F32_F32
;
2821 case NVPTXISD::Tld4UnifiedG2DFloatFloat
:
2822 Opc
= NVPTX::TLD4_UNIFIED_G_2D_F32_F32
;
2824 case NVPTXISD::Tld4UnifiedB2DFloatFloat
:
2825 Opc
= NVPTX::TLD4_UNIFIED_B_2D_F32_F32
;
2827 case NVPTXISD::Tld4UnifiedA2DFloatFloat
:
2828 Opc
= NVPTX::TLD4_UNIFIED_A_2D_F32_F32
;
2830 case NVPTXISD::Tld4UnifiedR2DS64Float
:
2831 Opc
= NVPTX::TLD4_UNIFIED_R_2D_S32_F32
;
2833 case NVPTXISD::Tld4UnifiedG2DS64Float
:
2834 Opc
= NVPTX::TLD4_UNIFIED_G_2D_S32_F32
;
2836 case NVPTXISD::Tld4UnifiedB2DS64Float
:
2837 Opc
= NVPTX::TLD4_UNIFIED_B_2D_S32_F32
;
2839 case NVPTXISD::Tld4UnifiedA2DS64Float
:
2840 Opc
= NVPTX::TLD4_UNIFIED_A_2D_S32_F32
;
2842 case NVPTXISD::Tld4UnifiedR2DU64Float
:
2843 Opc
= NVPTX::TLD4_UNIFIED_R_2D_U32_F32
;
2845 case NVPTXISD::Tld4UnifiedG2DU64Float
:
2846 Opc
= NVPTX::TLD4_UNIFIED_G_2D_U32_F32
;
2848 case NVPTXISD::Tld4UnifiedB2DU64Float
:
2849 Opc
= NVPTX::TLD4_UNIFIED_B_2D_U32_F32
;
2851 case NVPTXISD::Tld4UnifiedA2DU64Float
:
2852 Opc
= NVPTX::TLD4_UNIFIED_A_2D_U32_F32
;
2856 // Copy over operands
2857 SmallVector
<SDValue
, 8> Ops(drop_begin(N
->ops()));
2858 Ops
.push_back(N
->getOperand(0)); // Move chain to the back.
2860 ReplaceNode(N
, CurDAG
->getMachineNode(Opc
, SDLoc(N
), N
->getVTList(), Ops
));
2864 bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode
*N
) {
2866 switch (N
->getOpcode()) {
2867 default: return false;
2868 case NVPTXISD::Suld1DI8Clamp
:
2869 Opc
= NVPTX::SULD_1D_I8_CLAMP
;
2871 case NVPTXISD::Suld1DI16Clamp
:
2872 Opc
= NVPTX::SULD_1D_I16_CLAMP
;
2874 case NVPTXISD::Suld1DI32Clamp
:
2875 Opc
= NVPTX::SULD_1D_I32_CLAMP
;
2877 case NVPTXISD::Suld1DI64Clamp
:
2878 Opc
= NVPTX::SULD_1D_I64_CLAMP
;
2880 case NVPTXISD::Suld1DV2I8Clamp
:
2881 Opc
= NVPTX::SULD_1D_V2I8_CLAMP
;
2883 case NVPTXISD::Suld1DV2I16Clamp
:
2884 Opc
= NVPTX::SULD_1D_V2I16_CLAMP
;
2886 case NVPTXISD::Suld1DV2I32Clamp
:
2887 Opc
= NVPTX::SULD_1D_V2I32_CLAMP
;
2889 case NVPTXISD::Suld1DV2I64Clamp
:
2890 Opc
= NVPTX::SULD_1D_V2I64_CLAMP
;
2892 case NVPTXISD::Suld1DV4I8Clamp
:
2893 Opc
= NVPTX::SULD_1D_V4I8_CLAMP
;
2895 case NVPTXISD::Suld1DV4I16Clamp
:
2896 Opc
= NVPTX::SULD_1D_V4I16_CLAMP
;
2898 case NVPTXISD::Suld1DV4I32Clamp
:
2899 Opc
= NVPTX::SULD_1D_V4I32_CLAMP
;
2901 case NVPTXISD::Suld1DArrayI8Clamp
:
2902 Opc
= NVPTX::SULD_1D_ARRAY_I8_CLAMP
;
2904 case NVPTXISD::Suld1DArrayI16Clamp
:
2905 Opc
= NVPTX::SULD_1D_ARRAY_I16_CLAMP
;
2907 case NVPTXISD::Suld1DArrayI32Clamp
:
2908 Opc
= NVPTX::SULD_1D_ARRAY_I32_CLAMP
;
2910 case NVPTXISD::Suld1DArrayI64Clamp
:
2911 Opc
= NVPTX::SULD_1D_ARRAY_I64_CLAMP
;
2913 case NVPTXISD::Suld1DArrayV2I8Clamp
:
2914 Opc
= NVPTX::SULD_1D_ARRAY_V2I8_CLAMP
;
2916 case NVPTXISD::Suld1DArrayV2I16Clamp
:
2917 Opc
= NVPTX::SULD_1D_ARRAY_V2I16_CLAMP
;
2919 case NVPTXISD::Suld1DArrayV2I32Clamp
:
2920 Opc
= NVPTX::SULD_1D_ARRAY_V2I32_CLAMP
;
2922 case NVPTXISD::Suld1DArrayV2I64Clamp
:
2923 Opc
= NVPTX::SULD_1D_ARRAY_V2I64_CLAMP
;
2925 case NVPTXISD::Suld1DArrayV4I8Clamp
:
2926 Opc
= NVPTX::SULD_1D_ARRAY_V4I8_CLAMP
;
2928 case NVPTXISD::Suld1DArrayV4I16Clamp
:
2929 Opc
= NVPTX::SULD_1D_ARRAY_V4I16_CLAMP
;
2931 case NVPTXISD::Suld1DArrayV4I32Clamp
:
2932 Opc
= NVPTX::SULD_1D_ARRAY_V4I32_CLAMP
;
2934 case NVPTXISD::Suld2DI8Clamp
:
2935 Opc
= NVPTX::SULD_2D_I8_CLAMP
;
2937 case NVPTXISD::Suld2DI16Clamp
:
2938 Opc
= NVPTX::SULD_2D_I16_CLAMP
;
2940 case NVPTXISD::Suld2DI32Clamp
:
2941 Opc
= NVPTX::SULD_2D_I32_CLAMP
;
2943 case NVPTXISD::Suld2DI64Clamp
:
2944 Opc
= NVPTX::SULD_2D_I64_CLAMP
;
2946 case NVPTXISD::Suld2DV2I8Clamp
:
2947 Opc
= NVPTX::SULD_2D_V2I8_CLAMP
;
2949 case NVPTXISD::Suld2DV2I16Clamp
:
2950 Opc
= NVPTX::SULD_2D_V2I16_CLAMP
;
2952 case NVPTXISD::Suld2DV2I32Clamp
:
2953 Opc
= NVPTX::SULD_2D_V2I32_CLAMP
;
2955 case NVPTXISD::Suld2DV2I64Clamp
:
2956 Opc
= NVPTX::SULD_2D_V2I64_CLAMP
;
2958 case NVPTXISD::Suld2DV4I8Clamp
:
2959 Opc
= NVPTX::SULD_2D_V4I8_CLAMP
;
2961 case NVPTXISD::Suld2DV4I16Clamp
:
2962 Opc
= NVPTX::SULD_2D_V4I16_CLAMP
;
2964 case NVPTXISD::Suld2DV4I32Clamp
:
2965 Opc
= NVPTX::SULD_2D_V4I32_CLAMP
;
2967 case NVPTXISD::Suld2DArrayI8Clamp
:
2968 Opc
= NVPTX::SULD_2D_ARRAY_I8_CLAMP
;
2970 case NVPTXISD::Suld2DArrayI16Clamp
:
2971 Opc
= NVPTX::SULD_2D_ARRAY_I16_CLAMP
;
2973 case NVPTXISD::Suld2DArrayI32Clamp
:
2974 Opc
= NVPTX::SULD_2D_ARRAY_I32_CLAMP
;
2976 case NVPTXISD::Suld2DArrayI64Clamp
:
2977 Opc
= NVPTX::SULD_2D_ARRAY_I64_CLAMP
;
2979 case NVPTXISD::Suld2DArrayV2I8Clamp
:
2980 Opc
= NVPTX::SULD_2D_ARRAY_V2I8_CLAMP
;
2982 case NVPTXISD::Suld2DArrayV2I16Clamp
:
2983 Opc
= NVPTX::SULD_2D_ARRAY_V2I16_CLAMP
;
2985 case NVPTXISD::Suld2DArrayV2I32Clamp
:
2986 Opc
= NVPTX::SULD_2D_ARRAY_V2I32_CLAMP
;
2988 case NVPTXISD::Suld2DArrayV2I64Clamp
:
2989 Opc
= NVPTX::SULD_2D_ARRAY_V2I64_CLAMP
;
2991 case NVPTXISD::Suld2DArrayV4I8Clamp
:
2992 Opc
= NVPTX::SULD_2D_ARRAY_V4I8_CLAMP
;
2994 case NVPTXISD::Suld2DArrayV4I16Clamp
:
2995 Opc
= NVPTX::SULD_2D_ARRAY_V4I16_CLAMP
;
2997 case NVPTXISD::Suld2DArrayV4I32Clamp
:
2998 Opc
= NVPTX::SULD_2D_ARRAY_V4I32_CLAMP
;
3000 case NVPTXISD::Suld3DI8Clamp
:
3001 Opc
= NVPTX::SULD_3D_I8_CLAMP
;
3003 case NVPTXISD::Suld3DI16Clamp
:
3004 Opc
= NVPTX::SULD_3D_I16_CLAMP
;
3006 case NVPTXISD::Suld3DI32Clamp
:
3007 Opc
= NVPTX::SULD_3D_I32_CLAMP
;
3009 case NVPTXISD::Suld3DI64Clamp
:
3010 Opc
= NVPTX::SULD_3D_I64_CLAMP
;
3012 case NVPTXISD::Suld3DV2I8Clamp
:
3013 Opc
= NVPTX::SULD_3D_V2I8_CLAMP
;
3015 case NVPTXISD::Suld3DV2I16Clamp
:
3016 Opc
= NVPTX::SULD_3D_V2I16_CLAMP
;
3018 case NVPTXISD::Suld3DV2I32Clamp
:
3019 Opc
= NVPTX::SULD_3D_V2I32_CLAMP
;
3021 case NVPTXISD::Suld3DV2I64Clamp
:
3022 Opc
= NVPTX::SULD_3D_V2I64_CLAMP
;
3024 case NVPTXISD::Suld3DV4I8Clamp
:
3025 Opc
= NVPTX::SULD_3D_V4I8_CLAMP
;
3027 case NVPTXISD::Suld3DV4I16Clamp
:
3028 Opc
= NVPTX::SULD_3D_V4I16_CLAMP
;
3030 case NVPTXISD::Suld3DV4I32Clamp
:
3031 Opc
= NVPTX::SULD_3D_V4I32_CLAMP
;
3033 case NVPTXISD::Suld1DI8Trap
:
3034 Opc
= NVPTX::SULD_1D_I8_TRAP
;
3036 case NVPTXISD::Suld1DI16Trap
:
3037 Opc
= NVPTX::SULD_1D_I16_TRAP
;
3039 case NVPTXISD::Suld1DI32Trap
:
3040 Opc
= NVPTX::SULD_1D_I32_TRAP
;
3042 case NVPTXISD::Suld1DI64Trap
:
3043 Opc
= NVPTX::SULD_1D_I64_TRAP
;
3045 case NVPTXISD::Suld1DV2I8Trap
:
3046 Opc
= NVPTX::SULD_1D_V2I8_TRAP
;
3048 case NVPTXISD::Suld1DV2I16Trap
:
3049 Opc
= NVPTX::SULD_1D_V2I16_TRAP
;
3051 case NVPTXISD::Suld1DV2I32Trap
:
3052 Opc
= NVPTX::SULD_1D_V2I32_TRAP
;
3054 case NVPTXISD::Suld1DV2I64Trap
:
3055 Opc
= NVPTX::SULD_1D_V2I64_TRAP
;
3057 case NVPTXISD::Suld1DV4I8Trap
:
3058 Opc
= NVPTX::SULD_1D_V4I8_TRAP
;
3060 case NVPTXISD::Suld1DV4I16Trap
:
3061 Opc
= NVPTX::SULD_1D_V4I16_TRAP
;
3063 case NVPTXISD::Suld1DV4I32Trap
:
3064 Opc
= NVPTX::SULD_1D_V4I32_TRAP
;
3066 case NVPTXISD::Suld1DArrayI8Trap
:
3067 Opc
= NVPTX::SULD_1D_ARRAY_I8_TRAP
;
3069 case NVPTXISD::Suld1DArrayI16Trap
:
3070 Opc
= NVPTX::SULD_1D_ARRAY_I16_TRAP
;
3072 case NVPTXISD::Suld1DArrayI32Trap
:
3073 Opc
= NVPTX::SULD_1D_ARRAY_I32_TRAP
;
3075 case NVPTXISD::Suld1DArrayI64Trap
:
3076 Opc
= NVPTX::SULD_1D_ARRAY_I64_TRAP
;
3078 case NVPTXISD::Suld1DArrayV2I8Trap
:
3079 Opc
= NVPTX::SULD_1D_ARRAY_V2I8_TRAP
;
3081 case NVPTXISD::Suld1DArrayV2I16Trap
:
3082 Opc
= NVPTX::SULD_1D_ARRAY_V2I16_TRAP
;
3084 case NVPTXISD::Suld1DArrayV2I32Trap
:
3085 Opc
= NVPTX::SULD_1D_ARRAY_V2I32_TRAP
;
3087 case NVPTXISD::Suld1DArrayV2I64Trap
:
3088 Opc
= NVPTX::SULD_1D_ARRAY_V2I64_TRAP
;
3090 case NVPTXISD::Suld1DArrayV4I8Trap
:
3091 Opc
= NVPTX::SULD_1D_ARRAY_V4I8_TRAP
;
3093 case NVPTXISD::Suld1DArrayV4I16Trap
:
3094 Opc
= NVPTX::SULD_1D_ARRAY_V4I16_TRAP
;
3096 case NVPTXISD::Suld1DArrayV4I32Trap
:
3097 Opc
= NVPTX::SULD_1D_ARRAY_V4I32_TRAP
;
3099 case NVPTXISD::Suld2DI8Trap
:
3100 Opc
= NVPTX::SULD_2D_I8_TRAP
;
3102 case NVPTXISD::Suld2DI16Trap
:
3103 Opc
= NVPTX::SULD_2D_I16_TRAP
;
3105 case NVPTXISD::Suld2DI32Trap
:
3106 Opc
= NVPTX::SULD_2D_I32_TRAP
;
3108 case NVPTXISD::Suld2DI64Trap
:
3109 Opc
= NVPTX::SULD_2D_I64_TRAP
;
3111 case NVPTXISD::Suld2DV2I8Trap
:
3112 Opc
= NVPTX::SULD_2D_V2I8_TRAP
;
3114 case NVPTXISD::Suld2DV2I16Trap
:
3115 Opc
= NVPTX::SULD_2D_V2I16_TRAP
;
3117 case NVPTXISD::Suld2DV2I32Trap
:
3118 Opc
= NVPTX::SULD_2D_V2I32_TRAP
;
3120 case NVPTXISD::Suld2DV2I64Trap
:
3121 Opc
= NVPTX::SULD_2D_V2I64_TRAP
;
3123 case NVPTXISD::Suld2DV4I8Trap
:
3124 Opc
= NVPTX::SULD_2D_V4I8_TRAP
;
3126 case NVPTXISD::Suld2DV4I16Trap
:
3127 Opc
= NVPTX::SULD_2D_V4I16_TRAP
;
3129 case NVPTXISD::Suld2DV4I32Trap
:
3130 Opc
= NVPTX::SULD_2D_V4I32_TRAP
;
3132 case NVPTXISD::Suld2DArrayI8Trap
:
3133 Opc
= NVPTX::SULD_2D_ARRAY_I8_TRAP
;
3135 case NVPTXISD::Suld2DArrayI16Trap
:
3136 Opc
= NVPTX::SULD_2D_ARRAY_I16_TRAP
;
3138 case NVPTXISD::Suld2DArrayI32Trap
:
3139 Opc
= NVPTX::SULD_2D_ARRAY_I32_TRAP
;
3141 case NVPTXISD::Suld2DArrayI64Trap
:
3142 Opc
= NVPTX::SULD_2D_ARRAY_I64_TRAP
;
3144 case NVPTXISD::Suld2DArrayV2I8Trap
:
3145 Opc
= NVPTX::SULD_2D_ARRAY_V2I8_TRAP
;
3147 case NVPTXISD::Suld2DArrayV2I16Trap
:
3148 Opc
= NVPTX::SULD_2D_ARRAY_V2I16_TRAP
;
3150 case NVPTXISD::Suld2DArrayV2I32Trap
:
3151 Opc
= NVPTX::SULD_2D_ARRAY_V2I32_TRAP
;
3153 case NVPTXISD::Suld2DArrayV2I64Trap
:
3154 Opc
= NVPTX::SULD_2D_ARRAY_V2I64_TRAP
;
3156 case NVPTXISD::Suld2DArrayV4I8Trap
:
3157 Opc
= NVPTX::SULD_2D_ARRAY_V4I8_TRAP
;
3159 case NVPTXISD::Suld2DArrayV4I16Trap
:
3160 Opc
= NVPTX::SULD_2D_ARRAY_V4I16_TRAP
;
3162 case NVPTXISD::Suld2DArrayV4I32Trap
:
3163 Opc
= NVPTX::SULD_2D_ARRAY_V4I32_TRAP
;
3165 case NVPTXISD::Suld3DI8Trap
:
3166 Opc
= NVPTX::SULD_3D_I8_TRAP
;
3168 case NVPTXISD::Suld3DI16Trap
:
3169 Opc
= NVPTX::SULD_3D_I16_TRAP
;
3171 case NVPTXISD::Suld3DI32Trap
:
3172 Opc
= NVPTX::SULD_3D_I32_TRAP
;
3174 case NVPTXISD::Suld3DI64Trap
:
3175 Opc
= NVPTX::SULD_3D_I64_TRAP
;
3177 case NVPTXISD::Suld3DV2I8Trap
:
3178 Opc
= NVPTX::SULD_3D_V2I8_TRAP
;
3180 case NVPTXISD::Suld3DV2I16Trap
:
3181 Opc
= NVPTX::SULD_3D_V2I16_TRAP
;
3183 case NVPTXISD::Suld3DV2I32Trap
:
3184 Opc
= NVPTX::SULD_3D_V2I32_TRAP
;
3186 case NVPTXISD::Suld3DV2I64Trap
:
3187 Opc
= NVPTX::SULD_3D_V2I64_TRAP
;
3189 case NVPTXISD::Suld3DV4I8Trap
:
3190 Opc
= NVPTX::SULD_3D_V4I8_TRAP
;
3192 case NVPTXISD::Suld3DV4I16Trap
:
3193 Opc
= NVPTX::SULD_3D_V4I16_TRAP
;
3195 case NVPTXISD::Suld3DV4I32Trap
:
3196 Opc
= NVPTX::SULD_3D_V4I32_TRAP
;
3198 case NVPTXISD::Suld1DI8Zero
:
3199 Opc
= NVPTX::SULD_1D_I8_ZERO
;
3201 case NVPTXISD::Suld1DI16Zero
:
3202 Opc
= NVPTX::SULD_1D_I16_ZERO
;
3204 case NVPTXISD::Suld1DI32Zero
:
3205 Opc
= NVPTX::SULD_1D_I32_ZERO
;
3207 case NVPTXISD::Suld1DI64Zero
:
3208 Opc
= NVPTX::SULD_1D_I64_ZERO
;
3210 case NVPTXISD::Suld1DV2I8Zero
:
3211 Opc
= NVPTX::SULD_1D_V2I8_ZERO
;
3213 case NVPTXISD::Suld1DV2I16Zero
:
3214 Opc
= NVPTX::SULD_1D_V2I16_ZERO
;
3216 case NVPTXISD::Suld1DV2I32Zero
:
3217 Opc
= NVPTX::SULD_1D_V2I32_ZERO
;
3219 case NVPTXISD::Suld1DV2I64Zero
:
3220 Opc
= NVPTX::SULD_1D_V2I64_ZERO
;
3222 case NVPTXISD::Suld1DV4I8Zero
:
3223 Opc
= NVPTX::SULD_1D_V4I8_ZERO
;
3225 case NVPTXISD::Suld1DV4I16Zero
:
3226 Opc
= NVPTX::SULD_1D_V4I16_ZERO
;
3228 case NVPTXISD::Suld1DV4I32Zero
:
3229 Opc
= NVPTX::SULD_1D_V4I32_ZERO
;
3231 case NVPTXISD::Suld1DArrayI8Zero
:
3232 Opc
= NVPTX::SULD_1D_ARRAY_I8_ZERO
;
3234 case NVPTXISD::Suld1DArrayI16Zero
:
3235 Opc
= NVPTX::SULD_1D_ARRAY_I16_ZERO
;
3237 case NVPTXISD::Suld1DArrayI32Zero
:
3238 Opc
= NVPTX::SULD_1D_ARRAY_I32_ZERO
;
3240 case NVPTXISD::Suld1DArrayI64Zero
:
3241 Opc
= NVPTX::SULD_1D_ARRAY_I64_ZERO
;
3243 case NVPTXISD::Suld1DArrayV2I8Zero
:
3244 Opc
= NVPTX::SULD_1D_ARRAY_V2I8_ZERO
;
3246 case NVPTXISD::Suld1DArrayV2I16Zero
:
3247 Opc
= NVPTX::SULD_1D_ARRAY_V2I16_ZERO
;
3249 case NVPTXISD::Suld1DArrayV2I32Zero
:
3250 Opc
= NVPTX::SULD_1D_ARRAY_V2I32_ZERO
;
3252 case NVPTXISD::Suld1DArrayV2I64Zero
:
3253 Opc
= NVPTX::SULD_1D_ARRAY_V2I64_ZERO
;
3255 case NVPTXISD::Suld1DArrayV4I8Zero
:
3256 Opc
= NVPTX::SULD_1D_ARRAY_V4I8_ZERO
;
3258 case NVPTXISD::Suld1DArrayV4I16Zero
:
3259 Opc
= NVPTX::SULD_1D_ARRAY_V4I16_ZERO
;
3261 case NVPTXISD::Suld1DArrayV4I32Zero
:
3262 Opc
= NVPTX::SULD_1D_ARRAY_V4I32_ZERO
;
3264 case NVPTXISD::Suld2DI8Zero
:
3265 Opc
= NVPTX::SULD_2D_I8_ZERO
;
3267 case NVPTXISD::Suld2DI16Zero
:
3268 Opc
= NVPTX::SULD_2D_I16_ZERO
;
3270 case NVPTXISD::Suld2DI32Zero
:
3271 Opc
= NVPTX::SULD_2D_I32_ZERO
;
3273 case NVPTXISD::Suld2DI64Zero
:
3274 Opc
= NVPTX::SULD_2D_I64_ZERO
;
3276 case NVPTXISD::Suld2DV2I8Zero
:
3277 Opc
= NVPTX::SULD_2D_V2I8_ZERO
;
3279 case NVPTXISD::Suld2DV2I16Zero
:
3280 Opc
= NVPTX::SULD_2D_V2I16_ZERO
;
3282 case NVPTXISD::Suld2DV2I32Zero
:
3283 Opc
= NVPTX::SULD_2D_V2I32_ZERO
;
3285 case NVPTXISD::Suld2DV2I64Zero
:
3286 Opc
= NVPTX::SULD_2D_V2I64_ZERO
;
3288 case NVPTXISD::Suld2DV4I8Zero
:
3289 Opc
= NVPTX::SULD_2D_V4I8_ZERO
;
3291 case NVPTXISD::Suld2DV4I16Zero
:
3292 Opc
= NVPTX::SULD_2D_V4I16_ZERO
;
3294 case NVPTXISD::Suld2DV4I32Zero
:
3295 Opc
= NVPTX::SULD_2D_V4I32_ZERO
;
3297 case NVPTXISD::Suld2DArrayI8Zero
:
3298 Opc
= NVPTX::SULD_2D_ARRAY_I8_ZERO
;
3300 case NVPTXISD::Suld2DArrayI16Zero
:
3301 Opc
= NVPTX::SULD_2D_ARRAY_I16_ZERO
;
3303 case NVPTXISD::Suld2DArrayI32Zero
:
3304 Opc
= NVPTX::SULD_2D_ARRAY_I32_ZERO
;
3306 case NVPTXISD::Suld2DArrayI64Zero
:
3307 Opc
= NVPTX::SULD_2D_ARRAY_I64_ZERO
;
3309 case NVPTXISD::Suld2DArrayV2I8Zero
:
3310 Opc
= NVPTX::SULD_2D_ARRAY_V2I8_ZERO
;
3312 case NVPTXISD::Suld2DArrayV2I16Zero
:
3313 Opc
= NVPTX::SULD_2D_ARRAY_V2I16_ZERO
;
3315 case NVPTXISD::Suld2DArrayV2I32Zero
:
3316 Opc
= NVPTX::SULD_2D_ARRAY_V2I32_ZERO
;
3318 case NVPTXISD::Suld2DArrayV2I64Zero
:
3319 Opc
= NVPTX::SULD_2D_ARRAY_V2I64_ZERO
;
3321 case NVPTXISD::Suld2DArrayV4I8Zero
:
3322 Opc
= NVPTX::SULD_2D_ARRAY_V4I8_ZERO
;
3324 case NVPTXISD::Suld2DArrayV4I16Zero
:
3325 Opc
= NVPTX::SULD_2D_ARRAY_V4I16_ZERO
;
3327 case NVPTXISD::Suld2DArrayV4I32Zero
:
3328 Opc
= NVPTX::SULD_2D_ARRAY_V4I32_ZERO
;
3330 case NVPTXISD::Suld3DI8Zero
:
3331 Opc
= NVPTX::SULD_3D_I8_ZERO
;
3333 case NVPTXISD::Suld3DI16Zero
:
3334 Opc
= NVPTX::SULD_3D_I16_ZERO
;
3336 case NVPTXISD::Suld3DI32Zero
:
3337 Opc
= NVPTX::SULD_3D_I32_ZERO
;
3339 case NVPTXISD::Suld3DI64Zero
:
3340 Opc
= NVPTX::SULD_3D_I64_ZERO
;
3342 case NVPTXISD::Suld3DV2I8Zero
:
3343 Opc
= NVPTX::SULD_3D_V2I8_ZERO
;
3345 case NVPTXISD::Suld3DV2I16Zero
:
3346 Opc
= NVPTX::SULD_3D_V2I16_ZERO
;
3348 case NVPTXISD::Suld3DV2I32Zero
:
3349 Opc
= NVPTX::SULD_3D_V2I32_ZERO
;
3351 case NVPTXISD::Suld3DV2I64Zero
:
3352 Opc
= NVPTX::SULD_3D_V2I64_ZERO
;
3354 case NVPTXISD::Suld3DV4I8Zero
:
3355 Opc
= NVPTX::SULD_3D_V4I8_ZERO
;
3357 case NVPTXISD::Suld3DV4I16Zero
:
3358 Opc
= NVPTX::SULD_3D_V4I16_ZERO
;
3360 case NVPTXISD::Suld3DV4I32Zero
:
3361 Opc
= NVPTX::SULD_3D_V4I32_ZERO
;
3365 // Copy over operands
3366 SmallVector
<SDValue
, 8> Ops(drop_begin(N
->ops()));
3367 Ops
.push_back(N
->getOperand(0)); // Move chain to the back.
3369 ReplaceNode(N
, CurDAG
->getMachineNode(Opc
, SDLoc(N
), N
->getVTList(), Ops
));
3374 /// tryBFE - Look for instruction sequences that can be made more efficient
3375 /// by using the 'bfe' (bit-field extract) PTX instruction
///
/// \p N is examined when its opcode is ISD::AND, ISD::SRL or ISD::SRA. The
/// shapes looked at in the visible code are:
///   - (and (srl/sra val, start), mask) where mask is a run of low-order ones,
///   - (srl/sra (and val, mask), shift) where mask is a low or shifted mask,
///   - (srl/sra (shl val, NN), MM).
/// For each shape a start position and a length are materialized as target
/// constants, one of the BFE_{S,U}{32,64}rii opcodes is picked based on the
/// type of the extracted value, and \p N is replaced by the new machine node.
///
/// NOTE(review): the return statements and several guards of this function
/// are not visible in this excerpt; presumably it returns true only when
/// \p N was replaced -- confirm against the full source.
3376 bool NVPTXDAGToDAGISel::tryBFE(SDNode
*N
) {
// Operands of N: for AND these are the two inputs (canonicalized below), for
// a shift they are the shifted value and the shift amount.
3378 SDValue LHS
= N
->getOperand(0);
3379 SDValue RHS
= N
->getOperand(1);
// Signedness of the extract; starts out as unsigned.
3383 bool IsSigned
= false;
3385 if (N
->getOpcode() == ISD::AND
) {
3386 // Canonicalize the operands
3387 // We want 'and %val, %mask'
3388 if (isa
<ConstantSDNode
>(LHS
) && !isa
<ConstantSDNode
>(RHS
)) {
3389 std::swap(LHS
, RHS
);
3392 ConstantSDNode
*Mask
= dyn_cast
<ConstantSDNode
>(RHS
);
3394 // We need a constant mask on the RHS of the AND
3398 // Extract the mask bits
3399 uint64_t MaskVal
= Mask
->getZExtValue();
3400 if (!isMask_64(MaskVal
)) {
3401 // We *could* handle shifted masks here, but doing so would require an
3402 // 'and' operation to fix up the low-order bits so we would trade
3403 // shr+and for bfe+and, which has the same throughput
3407 // How many bits are in our mask?
3408 uint64_t NumBits
= countTrailingOnes(MaskVal
);
// The extract length is the width of the mask.
3409 Len
= CurDAG
->getTargetConstant(NumBits
, DL
, MVT::i32
);
3411 if (LHS
.getOpcode() == ISD::SRL
|| LHS
.getOpcode() == ISD::SRA
) {
3412 // We have a 'srl/and' pair, extract the effective start bit and length
3413 Val
= LHS
.getNode()->getOperand(0);
3414 Start
= LHS
.getNode()->getOperand(1);
3415 ConstantSDNode
*StartConst
= dyn_cast
<ConstantSDNode
>(Start
);
3417 uint64_t StartVal
= StartConst
->getZExtValue();
3418 // How many "good" bits do we have left? "good" is defined here as bits
3419 // that exist in the original value, not shifted in.
// NOTE(review): the width is read from Start (the shift-amount operand),
// not from Val -- confirm this is the intended width. The subtraction is
// unsigned, so a StartVal larger than that width would wrap; no guard for
// that is visible here.
3420 uint64_t GoodBits
= Start
.getValueSizeInBits() - StartVal
;
3421 if (NumBits
> GoodBits
) {
3422 // Do not handle the case where bits have been shifted in. In theory
3423 // we could handle this, but the cost is likely higher than just
3424 // emitting the srl/and pair.
// Re-materialize the constant shift amount as an i32 target constant.
3427 Start
= CurDAG
->getTargetConstant(StartVal
, DL
, MVT::i32
);
3429 // Do not handle the case where the shift amount (can be zero if no srl
3430 // was found) is not constant. We could handle this case, but it would
3431 // require run-time logic that would be more expensive than just
3432 // emitting the srl/and pair.
3436 // Do not handle the case where the LHS of the and is not a shift. While
3437 // it would be trivial to handle this case, it would just transform
3438 // 'and' -> 'bfe', but 'and' has higher-throughput.
3441 } else if (N
->getOpcode() == ISD::SRL
|| N
->getOpcode() == ISD::SRA
) {
// Shift of a masked value: (srl/sra (and val, mask), shift).
3442 if (LHS
->getOpcode() == ISD::AND
) {
3443 ConstantSDNode
*ShiftCnst
= dyn_cast
<ConstantSDNode
>(RHS
);
3445 // Shift amount must be constant
3449 uint64_t ShiftAmt
= ShiftCnst
->getZExtValue();
3451 SDValue AndLHS
= LHS
->getOperand(0);
3452 SDValue AndRHS
= LHS
->getOperand(1);
3454 // Canonicalize the AND to have the mask on the RHS
3455 if (isa
<ConstantSDNode
>(AndLHS
)) {
3456 std::swap(AndLHS
, AndRHS
);
3459 ConstantSDNode
*MaskCnst
= dyn_cast
<ConstantSDNode
>(AndRHS
);
3461 // Mask must be constant
3465 uint64_t MaskVal
= MaskCnst
->getZExtValue();
3468 if (isMask_64(MaskVal
)) {
3470 // The number of bits in the result bitfield will be the number of
3471 // trailing ones (the AND) minus the number of bits we shift off
// NOTE(review): no check that ShiftAmt does not exceed the number of
// trailing ones is visible here -- confirm one exists.
3472 NumBits
= countTrailingOnes(MaskVal
) - ShiftAmt
;
3473 } else if (isShiftedMask_64(MaskVal
)) {
// Shifted mask: count the zeros below the run of ones, then the ones.
3474 NumZeros
= countTrailingZeros(MaskVal
);
3475 unsigned NumOnes
= countTrailingOnes(MaskVal
>> NumZeros
);
3476 // The number of bits in the result bitfield will be the number of
3477 // trailing zeros plus the number of set bits in the mask minus the
3478 // number of bits we shift off
3479 NumBits
= NumZeros
+ NumOnes
- ShiftAmt
;
3481 // This is not a mask we can handle
3485 if (ShiftAmt
< NumZeros
) {
3486 // Handling this case would require extra logic that would make this
3487 // transformation non-profitable
// The extract starts at the shift amount and spans NumBits bits.
3492 Start
= CurDAG
->getTargetConstant(ShiftAmt
, DL
, MVT::i32
);
3493 Len
= CurDAG
->getTargetConstant(NumBits
, DL
, MVT::i32
);
3494 } else if (LHS
->getOpcode() == ISD::SHL
) {
3495 // Here, we have a pattern like:
3497 // (sra (shl val, NN), MM)
3499 // (srl (shl val, NN), MM)
3501 // If MM >= NN, we can efficiently optimize this with bfe
3502 Val
= LHS
->getOperand(0);
3504 SDValue ShlRHS
= LHS
->getOperand(1);
3505 ConstantSDNode
*ShlCnst
= dyn_cast
<ConstantSDNode
>(ShlRHS
);
3507 // Shift amount must be constant
3510 uint64_t InnerShiftAmt
= ShlCnst
->getZExtValue();
3512 SDValue ShrRHS
= RHS
;
3513 ConstantSDNode
*ShrCnst
= dyn_cast
<ConstantSDNode
>(ShrRHS
);
3515 // Shift amount must be constant
3518 uint64_t OuterShiftAmt
= ShrCnst
->getZExtValue();
3520 // To avoid extra codegen and be profitable, we need Outer >= Inner
3521 if (OuterShiftAmt
< InnerShiftAmt
) {
3525 // If the outer shift is more than the type size, we have no bitfield to
3526 // extract (since we also check that the inner shift is <= the outer shift
3527 // then this also implies that the inner shift is < the type size)
3528 if (OuterShiftAmt
>= Val
.getValueSizeInBits()) {
// Start is the difference of the two shift amounts; Len is the number of
// bits of Val that survive the outer shift. (The trailing arguments of
// both calls are not visible in this excerpt.)
3532 Start
= CurDAG
->getTargetConstant(OuterShiftAmt
- InnerShiftAmt
, DL
,
3534 Len
= CurDAG
->getTargetConstant(Val
.getValueSizeInBits() - OuterShiftAmt
,
3537 if (N
->getOpcode() == ISD::SRA
) {
3538 // If we have an arithmetic right shift, we need to use the signed bfe
3553 // For the BFE operations we form here from "and" and "srl", always use the
3554 // unsigned variants.
// Pick the opcode from the type of the extracted value. The condition that
// chooses between the S and U form of each width is not visible here
// (presumably IsSigned -- confirm).
3555 if (Val
.getValueType() == MVT::i32
) {
3557 Opc
= NVPTX::BFE_S32rii
;
3559 Opc
= NVPTX::BFE_U32rii
;
3561 } else if (Val
.getValueType() == MVT::i64
) {
3563 Opc
= NVPTX::BFE_S64rii
;
3565 Opc
= NVPTX::BFE_U64rii
;
3568 // We cannot handle this type
// Swap N for the selected BFE machine node, keeping N's value-type list.
3576 ReplaceNode(N
, CurDAG
->getMachineNode(Opc
, DL
, N
->getVTList(), Ops
));
3580 // SelectDirectAddr - Match a direct address for DAG.
3581 // A direct address could be a globaladdress or externalsymbol.
//
// N is the candidate address node; Address is the output operand written on
// a match. Three forms are checked in order: a TargetGlobalAddress or
// TargetExternalSymbol node, an NVPTXISD::Wrapper node (its operand 0 becomes
// the address), and an address-space cast from generic to param whose source
// is an NVPTXISD::MoveParam (the MoveParam's operand is matched recursively).
// NOTE(review): the statements executed for the first form and the final
// fall-through return are not visible in this excerpt.
3582 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N
, SDValue
&Address
) {
3583 // Return true if TGA or ES.
3584 if (N
.getOpcode() == ISD::TargetGlobalAddress
||
3585 N
.getOpcode() == ISD::TargetExternalSymbol
) {
// A wrapper node: the address is the wrapped operand.
3589 if (N
.getOpcode() == NVPTXISD::Wrapper
) {
3590 Address
= N
.getOperand(0);
3593 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3594 if (AddrSpaceCastSDNode
*CastN
= dyn_cast
<AddrSpaceCastSDNode
>(N
)) {
// Only a generic -> param cast of a MoveParam qualifies; recurse on the
// operand underneath the MoveParam.
3595 if (CastN
->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC
&&
3596 CastN
->getDestAddressSpace() == ADDRESS_SPACE_PARAM
&&
3597 CastN
->getOperand(0).getOpcode() == NVPTXISD::MoveParam
)
3598 return SelectDirectAddr(CastN
->getOperand(0).getOperand(0), Address
);
// Shared implementation of the SelectADDRsi / SelectADDRsi64 matchers.
// Looks for Addr of the form (add X, constant) where X is accepted by
// SelectDirectAddr. On a match, Base receives the direct address and Offset a
// target constant built from the constant's zero-extended value.
// OpNode is used only for the debug location of the new constant.
// NOTE(review): the last argument of the getTargetConstant call and the
// return statements are not visible in this excerpt; mvt is presumably the
// type of the Offset constant -- confirm.
3604 bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3605 SDNode
*OpNode
, SDValue Addr
, SDValue
&Base
, SDValue
&Offset
, MVT mvt
) {
3606 if (Addr
.getOpcode() == ISD::ADD
) {
// The offset must be a constant in operand 1 of the add.
3607 if (ConstantSDNode
*CN
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(1))) {
3608 SDValue base
= Addr
.getOperand(0);
// Operand 0 must itself be a direct address; it is written to Base.
3609 if (SelectDirectAddr(base
, Base
)) {
3610 Offset
= CurDAG
->getTargetConstant(CN
->getZExtValue(), SDLoc(OpNode
),
// Entry point that forwards to SelectADDRsi_imp with MVT::i32 as the type
// argument; OpNode, Addr, Base and Offset are passed through unchanged and
// the helper's result is returned as-is.
3620 bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode
*OpNode
, SDValue Addr
,
3621 SDValue
&Base
, SDValue
&Offset
) {
3622 return SelectADDRsi_imp(OpNode
, Addr
, Base
, Offset
, MVT::i32
);
// Entry point that forwards to SelectADDRsi_imp with MVT::i64 as the type
// argument; OpNode, Addr, Base and Offset are passed through unchanged and
// the helper's result is returned as-is.
3626 bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode
*OpNode
, SDValue Addr
,
3627 SDValue
&Base
, SDValue
&Offset
) {
3628 return SelectADDRsi_imp(OpNode
, Addr
, Base
, Offset
, MVT::i64
);
// Shared implementation of the SelectADDRri / SelectADDRri64 matchers.
// Handles, in order:
//   - a bare frame index: Base becomes the target frame index of type mvt and
//     Offset a zero constant of type mvt;
//   - a TargetExternalSymbol / TargetGlobalAddress: rejected (returns false);
//   - (add X, constant): Base becomes X (or the target frame index when X is
//     a frame index) and Offset the constant's zero-extended value.
// OpNode is used only for the debug location of new constants.
// NOTE(review): several return statements, the body of the SelectDirectAddr
// guard and the last argument of the final getTargetConstant call are not
// visible in this excerpt.
3632 bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3633 SDNode
*OpNode
, SDValue Addr
, SDValue
&Base
, SDValue
&Offset
, MVT mvt
) {
// Plain frame index: base is the frame slot, offset is zero.
3634 if (FrameIndexSDNode
*FIN
= dyn_cast
<FrameIndexSDNode
>(Addr
)) {
3635 Base
= CurDAG
->getTargetFrameIndex(FIN
->getIndex(), mvt
);
3636 Offset
= CurDAG
->getTargetConstant(0, SDLoc(OpNode
), mvt
);
3639 if (Addr
.getOpcode() == ISD::TargetExternalSymbol
||
3640 Addr
.getOpcode() == ISD::TargetGlobalAddress
)
3641 return false; // direct calls.
3643 if (Addr
.getOpcode() == ISD::ADD
) {
// NOTE(review): SelectDirectAddr is given Addr itself as its output, so a
// successful match overwrites the local Addr; the action taken in that case
// is not visible here -- confirm.
3644 if (SelectDirectAddr(Addr
.getOperand(0), Addr
)) {
3647 if (ConstantSDNode
*CN
= dyn_cast
<ConstantSDNode
>(Addr
.getOperand(1))) {
3648 if (FrameIndexSDNode
*FIN
=
3649 dyn_cast
<FrameIndexSDNode
>(Addr
.getOperand(0)))
3650 // Constant offset from frame ref.
3651 Base
= CurDAG
->getTargetFrameIndex(FIN
->getIndex(), mvt
);
// Otherwise the add's first operand is used as the base unchanged.
3653 Base
= Addr
.getOperand(0);
3654 Offset
= CurDAG
->getTargetConstant(CN
->getZExtValue(), SDLoc(OpNode
),
// Entry point that forwards to SelectADDRri_imp with MVT::i32 as the type
// argument; OpNode, Addr, Base and Offset are passed through unchanged and
// the helper's result is returned as-is.
3663 bool NVPTXDAGToDAGISel::SelectADDRri(SDNode
*OpNode
, SDValue Addr
,
3664 SDValue
&Base
, SDValue
&Offset
) {
3665 return SelectADDRri_imp(OpNode
, Addr
, Base
, Offset
, MVT::i32
);
// Entry point that forwards to SelectADDRri_imp with MVT::i64 as the type
// argument; OpNode, Addr, Base and Offset are passed through unchanged and
// the helper's result is returned as-is.
3669 bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode
*OpNode
, SDValue Addr
,
3670 SDValue
&Base
, SDValue
&Offset
) {
3671 return SelectADDRri_imp(OpNode
, Addr
, Base
, Offset
, MVT::i64
);
// Checks whether the memory access described by N is in address space spN.
// When N is a MemSDNode, the IR value attached to its memory operand is
// fetched, and if that value has pointer type the result is whether the
// pointer's address space equals spN.
// NOTE(review): the action taken for the "spN == 0 with a pseudo source
// value" case, the handling of a null Src and the fall-through return are not
// visible in this excerpt.
3674 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode
*N
,
3675 unsigned int spN
) const {
3676 const Value
*Src
= nullptr;
3677 if (MemSDNode
*mN
= dyn_cast
<MemSDNode
>(N
)) {
// Special case: address space 0 combined with a pseudo source value.
3678 if (spN
== 0 && mN
->getMemOperand()->getPseudoValue())
3680 Src
= mN
->getMemOperand()->getValue();
// Compare the address space of the underlying pointer type against spN.
3684 if (auto *PT
= dyn_cast
<PointerType
>(Src
->getType()))
3685 return (PT
->getAddressSpace() == spN
);
3689 /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3690 /// inline asm expressions.
///
/// For the memory constraint (InlineAsm::Constraint_m), \p Op is first tried
/// as a direct address, in which case the address and an i32 zero target
/// constant are appended to \p OutOps; otherwise it is tried through
/// SelectADDRri and the resulting base and offset are appended.
/// NOTE(review): the return statements, the other switch labels and the
/// declarations of Op0/Op1 are not visible in this excerpt.
3691 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3692 const SDValue
&Op
, unsigned ConstraintID
, std::vector
<SDValue
> &OutOps
) {
3694 switch (ConstraintID
) {
3697 case InlineAsm::Constraint_m
: // memory
// Direct address: emit (address, 0).
3698 if (SelectDirectAddr(Op
, Op0
)) {
3699 OutOps
.push_back(Op0
);
3700 OutOps
.push_back(CurDAG
->getTargetConstant(0, SDLoc(Op
), MVT::i32
));
// Otherwise try the 32-bit register+immediate form: emit (base, offset).
3703 if (SelectADDRri(Op
.getNode(), Op
, Op0
, Op1
)) {
3704 OutOps
.push_back(Op0
);
3705 OutOps
.push_back(Op1
);
3713 /// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3714 /// conversion from \p SrcTy to \p DestTy.
///
/// The lookup is a switch on the source type with a nested switch on the
/// destination type. In every returned pair, IsSigned selects the signed
/// opcode (CVT_s*) over the unsigned one (CVT_u*). Judging by the opcode
/// names (CVT_<dest>_<src>), the four visible groups are for 8-, 16-, 32-
/// and 64-bit integer sources. An unhandled source or destination type hits
/// llvm_unreachable.
/// NOTE(review): the case labels and the remainder of the parameter list are
/// not visible in this excerpt.
3715 unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy
, MVT SrcTy
,
3717 switch (SrcTy
.SimpleTy
) {
3719 llvm_unreachable("Unhandled source type");
// Group whose opcodes convert from an 8-bit source.
3721 switch (DestTy
.SimpleTy
) {
3723 llvm_unreachable("Unhandled dest type");
3725 return IsSigned
? NVPTX::CVT_s16_s8
: NVPTX::CVT_u16_u8
;
3727 return IsSigned
? NVPTX::CVT_s32_s8
: NVPTX::CVT_u32_u8
;
3729 return IsSigned
? NVPTX::CVT_s64_s8
: NVPTX::CVT_u64_u8
;
// Group whose opcodes convert from a 16-bit source.
3732 switch (DestTy
.SimpleTy
) {
3734 llvm_unreachable("Unhandled dest type");
3736 return IsSigned
? NVPTX::CVT_s8_s16
: NVPTX::CVT_u8_u16
;
3738 return IsSigned
? NVPTX::CVT_s32_s16
: NVPTX::CVT_u32_u16
;
3740 return IsSigned
? NVPTX::CVT_s64_s16
: NVPTX::CVT_u64_u16
;
// Group whose opcodes convert from a 32-bit source.
3743 switch (DestTy
.SimpleTy
) {
3745 llvm_unreachable("Unhandled dest type");
3747 return IsSigned
? NVPTX::CVT_s8_s32
: NVPTX::CVT_u8_u32
;
3749 return IsSigned
? NVPTX::CVT_s16_s32
: NVPTX::CVT_u16_u32
;
3751 return IsSigned
? NVPTX::CVT_s64_s32
: NVPTX::CVT_u64_u32
;
// Group whose opcodes convert from a 64-bit source.
3754 switch (DestTy
.SimpleTy
) {
3756 llvm_unreachable("Unhandled dest type");
3758 return IsSigned
? NVPTX::CVT_s8_s64
: NVPTX::CVT_u8_u64
;
3760 return IsSigned
? NVPTX::CVT_s16_s64
: NVPTX::CVT_u16_u64
;
3762 return IsSigned
? NVPTX::CVT_s32_s64
: NVPTX::CVT_u32_u64
;