llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.nsa.ll

   1 ; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX10-NONSA %s
   3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
   4 ; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1010-NSA %s
   5 ; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX1030-NSA %s
   6 ; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-nsa-encoding -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
   7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=32 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NONSA,GFX11-NONSA %s
   8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-nsa-threshold=2 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T2 %s
   9 ; RUN: llc -amdgpu-nsa-threshold=3 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NSA,NSA-T3,GFX11-NSA %s
     ;
     ; Check-prefix summary for the llc invocations above:
     ;   GCN                 - shared by every invocation.
     ;   NONSA               - nsa-encoding turned off via -mattr, or the threshold
     ;                         raised to 32; paired with GFX10-NONSA on gfx1010
     ;                         and GFX11-NONSA on gfx1100.
     ;   NSA with NSA-T2     - threshold lowered to 2 (gfx1010 and gfx1100).
     ;   NSA with NSA-T3     - threshold 3, paired with one per-target prefix,
     ;                         GFX1010-NSA, GFX1030-NSA or GFX11-NSA.
  10
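  11 ; The default NSA threshold is 3 addresses, so this 2-address sample is only
     ; expected to use the NSA register-list form when the threshold is lowered to 2.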
  12 ; GCN-LABEL: {{^}}sample_2d:
     ; Two-address sample. The call passes the coordinates as (%s, %t), the
     ; reverse of the function argument order (%t, %s). The threshold-2 runs
     ; expect the bracketed register list [v1, v0]; the NONSA and threshold-3
     ; runs expect a v_mov followed by the contiguous range v[1:2].
  13 ; NONSA: v_mov_b32_e32 v2, v0
  14 ; NONSA: image_sample v[0:3], v[1:2],
  15 ; NSA-T2: image_sample v[0:3], [v1, v0],
  16 ; NSA-T3: v_mov_b32_e32 v2, v0
  17 ; NSA-T3: image_sample v[0:3], v[1:2],
  18 define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %t, float %s) {
  19 main_body:
  20   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  21   ret <4 x float> %v
  22 }
  23
  24 ; GCN-LABEL: {{^}}sample_3d:
     ; Three-address sample. The call passes (%s, %t, %r) while the function
     ; arguments arrive as (%r, %s, %t). Every NSA run expects the register
     ; list [v1, v2, v0]; the NONSA runs expect a v_mov into v3 followed by the
     ; contiguous range v[1:3].
  25 ; NONSA: v_mov_b32_e32 v3, v0
  26 ; NONSA: image_sample v[0:3], v[1:3],
  27 ; NSA: image_sample v[0:3], [v1, v2, v0],
  28 define amdgpu_ps <4 x float> @sample_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %r, float %s, float %t) {
  29 main_body:
  30   %v = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 15, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  31   ret <4 x float> %v
  32 }
  33
  34 ; GCN-LABEL: {{^}}sample_d_3d:
     ; Nine-address sample_d (six derivatives, then s, t, r), passed in a
     ; different order from the function arguments. Only the threshold-3 NSA
     ; runs are checked here, and each target expects a different operand shape:
     ; gfx1010 a contiguous v[7:15], gfx1030 a full nine-register list, and
     ; gfx1100 four single registers followed by the range v[9:13].
     ; NOTE(review): the differences presumably reflect per-target limits on
     ; the NSA operand count - confirm against the backend.
  35 ; GFX1010-NSA: image_sample_d v[0:3], v[7:15],
  36 ; GFX1030-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v4, v6, v0, v2, v1],
  37 ; GFX11-NSA: image_sample_d v[0:3], [v3, v8, v7, v5, v[9:13]],
  38 define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %r, float %t, float %dsdh, float %dtdv, float %dsdv, float %drdv, float %drdh, float %dtdh) {
  39 main_body:
  40   %v = call <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32 15, float %dsdh, float %dtdh, float %drdh, float %dsdv, float %dtdv, float %drdv, float %s, float %t, float %r, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  41   ret <4 x float> %v
  42 }
  43
  44 ; GCN-LABEL: {{^}}sample_contig_nsa:
     ; Two samples in one function. The first (sample_c_l) takes its five
     ; addresses in the same order as the first five function arguments, so it
     ; is expected to use the contiguous range v[0:4]. The second takes
     ; (%s2, %t2, %r2) while the arguments arrive as (%r2, %s2, %t2), so the NSA
     ; runs expect the register list [v6, v7, v5]. The NONSA runs only check
     ; the first instruction.
  45 ; GFX10-NONSA: image_sample_c_l v5, v[0:4],
  46 ; GFX11-NONSA: image_sample_c_l v0, v[0:4],
  47 ; GFX1010-NSA: image_sample_c_l v8, v[0:4],
  48 ; GFX1010-NSA: image_sample v9, [v6, v7, v5],
  49 ; GFX1030-NSA: image_sample_c_l v0, v[0:4],
  50 ; GFX1030-NSA: image_sample v1, [v6, v7, v5],
  51 ; GFX11-NSA: image_sample_c_l v0, v[0:4],
  52 ; GFX11-NSA: image_sample v1, [v6, v7, v5],
  53 define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) {
  54 main_body:
  55   %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  56   %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  57   %r.0 = insertelement <2 x float> undef, float %v1, i32 0
  58   %r = insertelement <2 x float> %r.0, float %v2, i32 1
  59   ret <2 x float> %r
  60 }
  61
  62 ; GCN-LABEL: {{^}}sample_nsa_nsa:
     ; Both samples receive their addresses out of argument order. %lod is the
     ; first function argument but the last address of sample_c_l, giving the
     ; expected list [v1, v2, v3, v4, v0]; the second sample takes
     ; (%s2, %t2, %r2) from arguments ordered (%r2, %s2, %t2), giving
     ; [v6, v7, v5]. Only the threshold-3 NSA runs are checked.
  63 ; GFX1010-NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
  64 ; GFX1010-NSA: image_sample v9, [v6, v7, v5],
  65 ; GFX1030-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
  66 ; GFX1030-NSA: image_sample v1, [v6, v7, v5],
  67 ; GFX11-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
  68 ; GFX11-NSA: image_sample v1, [v6, v7, v5],
  69 define amdgpu_ps <2 x float> @sample_nsa_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %r2, float %s2, float %t2) {
  70 main_body:
  71   %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  72   %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  73   %r.0 = insertelement <2 x float> undef, float %v1, i32 0
  74   %r = insertelement <2 x float> %r.0, float %v2, i32 1
  75   ret <2 x float> %r
  76 }
  77
  78 ; GCN-LABEL: {{^}}sample_nsa_contig:
     ; The first sample (sample_c_l) has %lod as the first function argument
     ; but the last address, so the register list [v1, v2, v3, v4, v0] is
     ; expected. The second sample takes (%s2, %t2, %r2) in argument order and
     ; is expected to use the contiguous range v[5:7]. Only the threshold-3 NSA
     ; runs are checked.
  79 ; GFX1010-NSA: image_sample_c_l v8, [v1, v2, v3, v4, v0],
  80 ; GFX1010-NSA: image_sample v9, v[5:7],
  81 ; GFX1030-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
  82 ; GFX1030-NSA: image_sample v1, v[5:7],
  83 ; GFX11-NSA: image_sample_c_l v0, [v1, v2, v3, v4, v0],
  84 ; GFX11-NSA: image_sample v1, v[5:7],
  85 define amdgpu_ps <2 x float> @sample_nsa_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %lod, float %zcompare, float %s1, float %t1, float %r1, float %s2, float %t2, float %r2) {
  86 main_body:
  87   %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  88   %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
  89   %r.0 = insertelement <2 x float> undef, float %v1, i32 0
  90   %r = insertelement <2 x float> %r.0, float %v2, i32 1
  91   ret <2 x float> %r
  92 }
  93
  94 ; GCN-LABEL: {{^}}sample_contig_contig:
     ; Both samples take their addresses in function-argument order, so every
     ; checked configuration (threshold-3 NSA runs and NONSA runs alike)
     ; expects the contiguous ranges v[0:4] and v[5:7].
  95 ; GFX1010-NSA: image_sample_c_l v8, v[0:4],
  96 ; GFX1010-NSA: image_sample v9, v[5:7],
  97 ; GFX1030-NSA: image_sample_c_l v0, v[0:4],
  98 ; GFX1030-NSA: image_sample v1, v[5:7],
  99 ; GFX11-NSA: image_sample_c_l v0, v[0:4],
 100 ; GFX11-NSA: image_sample v1, v[5:7],
 101 ; GFX10-NONSA: image_sample_c_l v8, v[0:4],
 102 ; GFX10-NONSA: image_sample v9, v[5:7],
 103 ; GFX11-NONSA: image_sample_c_l v0, v[0:4],
 104 ; GFX11-NONSA: image_sample v1, v[5:7],
 105 define amdgpu_ps <2 x float> @sample_contig_contig(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %s2, float %t2, float %r2) {
 106 main_body:
 107   %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare, float %s1, float %t1, float %r1, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
 108   %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2, float %t2, float %r2, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
 109   %r.0 = insertelement <2 x float> undef, float %v1, i32 0
 110   %r = insertelement <2 x float> %r.0, float %v2, i32 1
 111   ret <2 x float> %r
 112 }
 113
 114 ; Test that undef inputs with NSA are handled safely; these tests used to crash.
 115
 116 ; GCN-LABEL: {{^}}sample_undef_undef_undef_undef:
     ; All four address operands are undef; every configuration expects the
     ; contiguous range v[0:3].
 117 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 118 define amdgpu_ps float @sample_undef_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp) {
 119   %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 120   ret float %r
 121 }
 122
 123 ; GCN-LABEL: {{^}}sample_undef_undef_undef_def:
     ; Only the last address operand (%layer) is defined; the first three are
     ; undef. The NONSA runs expect %layer to be moved into v3 and the
     ; contiguous range v[0:3] to be used; the NSA runs expect a register list
     ; that names v0 in all four slots.
 124 ; NONSA: v_mov_b32_e32 v3, v0
 125 ; NONSA: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 126 ; NSA: image_sample_c_b v0, [v0, v0, v0, v0], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 127 define amdgpu_ps float @sample_undef_undef_undef_def(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
 128   %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %layer, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 129   ret float %r
 130 }
 131
 132 ; GCN-LABEL: {{^}}sample_undef_undef_undef_def_rnd:
     ; Same shape as the previous undef/def case, except that the defined last
     ; operand is first passed through llvm.rint. Every configuration expects
     ; the rounding result to be written to v3 and the contiguous range v[0:3]
     ; to be used.
 133 ; GCN: v_rndne_f32_e32 v3, v0
 134 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 135 define amdgpu_ps float @sample_undef_undef_undef_def_rnd(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %layer) {
 136   %layer_rnd = call float @llvm.rint.f32(float %layer)
 137   %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float undef, float undef, float undef, float %layer_rnd, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 138   ret float %r
 139 }
 140
 141 ; GCN-LABEL: {{^}}sample_def_undef_undef_undef:
     ; Only the first address operand is defined (the result of an fadd); the
     ; remaining three are undef. Every configuration expects the add to write
     ; v0 and the sample to use the contiguous range v[0:3].
 142 ; GCN: v_add_f32_e32 v0, 1.0, v0
 143 ; GCN: image_sample_c_b v0, v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
 144 define amdgpu_ps float @sample_def_undef_undef_undef(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %z0) {
 145   ; The NSA reassign pass is conservative (quite reasonably!) when one of the operands
 146   ; comes directly from a function argument (via COPY). To test that NSA can be
 147   ; eliminated in the presence of undef, just add an arbitrary intermediate
 148   ; computation.
 149   %c0 = fadd float %z0, 1.0
 150   %r = call float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 1, float %c0, float undef, float undef, float undef, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
 151   ret float %r
 152 }
 153
     ; Intrinsic declarations used by the tests in this file. The image sample
     ; intrinsics use attribute group #1; llvm.rint uses group #2.
     ; <4 x float> results: used by sample_2d, sample_3d and sample_d_3d.
 154 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 155 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 156 declare <4 x float> @llvm.amdgcn.image.sample.d.3d.v4f32.f32(i32, float, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 157
     ; Scalar float results: used by the two-sample contig/nsa tests.
 158 declare float @llvm.amdgcn.image.sample.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 159 declare float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
 160
     ; Used by the undef-operand tests. Only this image declaration spells out
     ; immarg on its constant parameters.
 161 declare float @llvm.rint.f32(float) #2
 162 declare float @llvm.amdgcn.image.sample.c.b.1darray.f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
 163
 164 attributes #1 = { nounwind readonly }
 165 attributes #2 = { nounwind readnone speculatable willreturn }