llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll

   1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA %s
   2 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010,NSA %s
   3 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030,NSA %s
   4
   5 ; GCN-LABEL: {{^}}sample_2d:
   6 ;
   7 ; TODO: use NSA here
   8 ; GCN: v_mov_b32_e32 v2, v0
   9 ;
  10 ; GCN: image_sample v[0:3], v[1:2],
  11 define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) {
  12 entry:  ; parameters are declared as (%t, %s) but the sample below consumes (%s, %t)
  13   %texel = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  14   ret <4 x float> %texel  ; the sampled vector is returned unchanged
  15 }
  16
  17 ; GCN-LABEL: {{^}}sample_3d:
  18 ; NONSA: v_mov_b32_e32 v3, v0
  19 ; NONSA: image_sample v[0:3], v[1:3],
  20 ; NSA: image_sample v[0:3], [v1, v2, v0],
  21 define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) {
  22 entry:  ; parameters are declared as (%r, %s, %t) but sampled as (%s, %t, %r)
  23   %texel = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  24   ret <4 x float> %texel  ; the sampled vector is returned unchanged
  25 }
  26
  27 ; GCN-LABEL: {{^}}sample_d_3d:
  28 ; GFX1010: image_sample_d v[0:3], v[7:22],
  29 ; GFX1030: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
  30 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
  31 entry:  ; the nine float parameters are declared in a different order than the call consumes them
  32   %texel = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  33   ret <4 x float> %texel  ; the sampled vector is returned unchanged
  34 }
  35
  36 ; GCN-LABEL: {{^}}sample_contig_nsa:
  37 ; NONSA: image_sample_c_l v5, v[0:4],
  38 ; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
  39 ; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
  40 define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
  41 entry:
  42   %first = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; operands follow the parameter order
  43   %second = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; declared as (%r2, %s2, %t2), used as (%s2, %t2, %r2)
  44   %pair.lo = insertelement <2 x float> undef, float %first, i32 0
  45   %pair = insertelement <2 x float> %pair.lo, float %second, i32 1
  46   ret <2 x float> %pair  ; element 0 is the first sample, element 1 the second
  47 }
  48
  49 ; GCN-LABEL: {{^}}sample_nsa_nsa:
  50 ; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
  51 ; NSA: image_sample v{{[0-9]+}}, [v6, v7, v5],
  52 define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
  53 entry:
  54   %first = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; %lod is declared first but passed last
  55   %second = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; declared as (%r2, %s2, %t2), used as (%s2, %t2, %r2)
  56   %pair.lo = insertelement <2 x float> undef, float %first, i32 0
  57   %pair = insertelement <2 x float> %pair.lo, float %second, i32 1
  58   ret <2 x float> %pair  ; element 0 is the first sample, element 1 the second
  59 }
  60
  61 ; GCN-LABEL: {{^}}sample_nsa_contig:
  62 ; NSA: image_sample_c_l v{{[0-9]+}}, [v1, v2, v3, v4, v0],
  63 ; NSA: image_sample v{{[0-9]+}}, v[5:7],
  64 define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
  65 entry:
  66   %first = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; %lod is declared first but passed last
  67   %second = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)  ; operands follow the parameter order
  68   %pair.lo = insertelement <2 x float> undef, float %first, i32 0
  69   %pair = insertelement <2 x float> %pair.lo, float %second, i32 1
  70   ret <2 x float> %pair  ; element 0 is the first sample, element 1 the second
  71 }
  72
  73 ; GCN-LABEL: {{^}}sample_contig_contig:
  74 ; NSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
  75 ; NSA: image_sample v{{[0-9]+}}, v[5:7],
  76 ; NONSA: image_sample_c_l v{{[0-9]+}}, v[0:4],
  77 ; NONSA: image_sample v{{[0-9]+}}, v[5:7],
  78 define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
  79 entry:  ; both calls below take their float operands in parameter order
  80   %first = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  81   %second = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  82   %pair.lo = insertelement <2 x float> undef, float %first, i32 0
  83   %pair = insertelement <2 x float> %pair.lo, float %second, i32 1
  84   ret <2 x float> %pair  ; element 0 is the first sample, element 1 the second
  85 }
  86
  87 ; Test that undef inputs with NSA are handled safely; these tests used to crash.
  88
  89 ; GCN-LABEL: {{^}}sample_undef_undef_undef_undef:
  90 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
  91 define amdgpu_ps float @sample_undef_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp) {
  92   %texel = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)  ; all four float operands are undef
  93   ret float %texel
  94 }
  95
  96 ; GCN-LABEL: {{^}}sample_undef_undef_undef_def:
  97 ; NONSA: v_mov_b32_e32 v3, v0
  98 ; NONSA: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
  99 ; NSA: image_sample_c_b v0, [v0, v0, v0, v0], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 100 define amdgpu_ps float @sample_undef_undef_undef_def(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
 101   %texel = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %layer, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)  ; only the fourth float operand (%layer) is defined
 102   ret float %texel
 103 }
 104
 105 ; GCN-LABEL: {{^}}sample_undef_undef_undef_def_rnd:
 106 ; GCN: v_rndne_f32_e32 v3, v0
 107 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 108 define amdgpu_ps float @sample_undef_undef_undef_def_rnd(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
 109   %rounded = call float @llvm.rint.f32(float %layer)  ; the layer goes through rint before it reaches the sample
 110   %texel = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %rounded, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)  ; only the fourth float operand is defined
 111   ret float %texel
 112 }
 113
 114 ; GCN-LABEL: {{^}}sample_def_undef_undef_undef:
 115 ; GCN: v_add_f32_e32 v0, 1.0, v0
 116 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 117 define amdgpu_ps float @sample_def_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %z0) {
 118   ; An operand that comes straight from a function argument (via COPY) makes the
 119   ; NSA reassign pass act conservatively, which is quite reasonable. Insert an
 120   ; arbitrary intermediate computation so this test can show that NSA is still
 121   ; eliminated when the remaining operands are undef.
 122   %z0.plus1 = fadd float %z0, 1.000000e+00
 123   %texel = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float %z0.plus1, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)  ; only the first float operand is defined
 124   ret float %texel
 125 }
 126
 127 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; called by @sample_2d
 128 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; called by @sample_3d
 129 declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; called by @sample_d_3d
 130
 131 declare float @llvm.amdgcn.image.sample.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; float-returning form; second call in @sample_contig_nsa, @sample_nsa_nsa, @sample_nsa_contig and @sample_contig_contig
 132 declare float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 ; float-returning form; first call in the same four functions
 133
 134 declare float @llvm.rint.f32(float) #2 ; called by @sample_undef_undef_undef_def_rnd
 135 declare float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 ; called by the four undef-operand tests; the only declaration in this list that spells out immarg
 136
 137 attributes #1 = { nounwind readonly } ; attached to every image-sample intrinsic declaration in this list
 138 attributes #2 = { nounwind readnone speculatable willreturn } ; attached to @llvm.rint.f32 only