llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s
   3
   4 define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
   5 ; GCN-LABEL: copy_flat:
   6 ; GCN:       ; %bb.0: ; %entry
   7 ; GCN-NEXT:    s_load_b32 s4, s[2:3], 0x34
   8 ; GCN-NEXT:    s_wait_kmcnt 0x0
   9 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
  10 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_3
  11 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
  12 ; GCN-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
  13 ; GCN-NEXT:    s_wait_kmcnt 0x0
  14 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
  15 ; GCN-NEXT:  .LBB0_2: ; %for.body
  16 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
  17 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
  18 ; GCN-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
  19 ; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
  20 ; GCN-NEXT:    v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
  21 ; GCN-NEXT:    s_add_co_i32 s4, s4, -1
  22 ; GCN-NEXT:    flat_load_b128 v[0:3], v[0:1] offset:-176
  23 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
  24 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
  25 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
  26 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
  27 ; GCN-NEXT:    flat_store_b128 v[4:5], v[0:3]
  28 ; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
  29 ; GCN-NEXT:  .LBB0_3: ; %for.end
  30 ; GCN-NEXT:    s_endpgm
     ; Copy loop moving %n <4 x i32> elements from %s to %d, where both
     ; pointers are plain `ptr` arguments (no explicit address space).
     ; The assertions above expect flat_load_b128 / flat_store_b128 plus one
     ; s_prefetch_data per iteration: the source pointer is advanced by 0xb0
     ; (176) bytes in the preheader, prefetched at offset 0x0, and the load
     ; compensates with offset:-176.
  31 entry:
       ; Bypass the loop entirely when %n is zero.
  32   %cmp6.not = icmp eq i32 %n, 0
  33   br i1 %cmp6.not, label %for.end, label %for.body
  34
  35 for.body:                                         ; preds = %entry, %for.body
       ; %i.07 is the element index: 0 on entry, %inc on the back edge.
  36   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  37   %idxprom = zext i32 %i.07 to i64
       ; Load one 16-byte vector from %s[i]; only 4-byte alignment is declared.
  38   %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom
  39   %ld = load <4 x i32>, ptr %arrayidx, align 4
       ; Store the loaded vector to %d[i].
  40   %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom
  41   store <4 x i32> %ld, ptr %arrayidx2, align 4
       ; Advance the index and leave the loop once it reaches %n.
  42   %inc = add nuw i32 %i.07, 1
  43   %exitcond.not = icmp eq i32 %inc, %n
  44   br i1 %exitcond.not, label %for.end, label %for.body
  45
  46 for.end:                                          ; preds = %for.body, %entry
  47   ret void
  48 }
  49
  50 define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
  51 ; GCN-LABEL: copy_global:
  52 ; GCN:       ; %bb.0: ; %entry
  53 ; GCN-NEXT:    s_load_b32 s4, s[2:3], 0x34
  54 ; GCN-NEXT:    s_wait_kmcnt 0x0
  55 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
  56 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_3
  57 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
  58 ; GCN-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
  59 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
  60 ; GCN-NEXT:    s_wait_kmcnt 0x0
  61 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 0xb0
  62 ; GCN-NEXT:  .LBB1_2: ; %for.body
  63 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
  64 ; GCN-NEXT:    global_load_b128 v[1:4], v0, s[2:3] offset:-176
  65 ; GCN-NEXT:    s_prefetch_data s[2:3], 0x0, null, 0
  66 ; GCN-NEXT:    s_add_co_i32 s4, s4, -1
  67 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
  68 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
  69 ; GCN-NEXT:    s_wait_loadcnt 0x0
  70 ; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
  71 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
  72 ; GCN-NEXT:    s_cbranch_scc1 .LBB1_2
  73 ; GCN-NEXT:  .LBB1_3: ; %for.end
  74 ; GCN-NEXT:    s_nop 0
  75 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
  76 ; GCN-NEXT:    s_endpgm
     ; Copy loop moving %n <4 x i32> elements from %s to %d, where both
     ; pointers are in addrspace(1).
     ; The assertions above expect global_load_b128 / global_store_b128 plus
     ; one s_prefetch_data per iteration: the source pointer is advanced by
     ; 0xb0 (176) bytes in the preheader, prefetched at offset 0x0, and the
     ; load compensates with offset:-176.
  77 entry:
       ; Bypass the loop entirely when %n is zero.
  78   %cmp6.not = icmp eq i32 %n, 0
  79   br i1 %cmp6.not, label %for.end, label %for.body
  80
  81 for.body:                                         ; preds = %entry, %for.body
       ; %i.07 is the element index: 0 on entry, %inc on the back edge.
  82   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  83   %idxprom = zext i32 %i.07 to i64
       ; Load one 16-byte vector from %s[i]; only 4-byte alignment is declared.
  84   %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom
  85   %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4
       ; Store the loaded vector to %d[i].
  86   %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
  87   store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
       ; Advance the index and leave the loop once it reaches %n.
  88   %inc = add nuw i32 %i.07, 1
  89   %exitcond.not = icmp eq i32 %inc, %n
  90   br i1 %exitcond.not, label %for.end, label %for.body
  91
  92 for.end:                                          ; preds = %for.body, %entry
  93   ret void
  94 }
  95
  96 define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
  97 ; GCN-LABEL: copy_constant:
  98 ; GCN:       ; %bb.0: ; %entry
  99 ; GCN-NEXT:    s_load_b32 s4, s[2:3], 0x34
 100 ; GCN-NEXT:    s_wait_kmcnt 0x0
 101 ; GCN-NEXT:    s_cmp_eq_u32 s4, 0
 102 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_3
 103 ; GCN-NEXT:  ; %bb.1: ; %for.body.preheader
 104 ; GCN-NEXT:    s_load_b128 s[0:3], s[2:3], 0x24
 105 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 106 ; GCN-NEXT:  .LBB2_2: ; %for.body
 107 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 108 ; GCN-NEXT:    s_wait_kmcnt 0x0
 109 ; GCN-NEXT:    s_load_b128 s[8:11], s[2:3], 0x0
 110 ; GCN-NEXT:    s_prefetch_data s[2:3], 0xb0, null, 0
 111 ; GCN-NEXT:    s_add_co_i32 s4, s4, -1
 112 ; GCN-NEXT:    s_add_nc_u64 s[2:3], s[2:3], 16
 113 ; GCN-NEXT:    s_cmp_lg_u32 s4, 0
 114 ; GCN-NEXT:    s_wait_kmcnt 0x0
 115 ; GCN-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
 116 ; GCN-NEXT:    v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
 117 ; GCN-NEXT:    global_store_b128 v0, v[1:4], s[0:1]
 118 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 119 ; GCN-NEXT:    s_cbranch_scc1 .LBB2_2
 120 ; GCN-NEXT:  .LBB2_3: ; %for.end
 121 ; GCN-NEXT:    s_nop 0
 122 ; GCN-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 123 ; GCN-NEXT:    s_endpgm
     ; Copy loop moving %n <4 x i32> elements from an addrspace(4) source %s
     ; to an addrspace(1) destination %d.
     ; The assertions above expect a scalar s_load_b128 at offset 0x0 and one
     ; s_prefetch_data per iteration that carries the 0xb0 (176) byte
     ; look-ahead as its own immediate offset; the source pointer is not
     ; pre-advanced in the preheader here.
 124 entry:
       ; Bypass the loop entirely when %n is zero.
 125   %cmp6.not = icmp eq i32 %n, 0
 126   br i1 %cmp6.not, label %for.end, label %for.body
 127
 128 for.body:                                         ; preds = %entry, %for.body
       ; %i.07 is the element index: 0 on entry, %inc on the back edge.
 129   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 130   %idxprom = zext i32 %i.07 to i64
       ; Load one 16-byte vector from %s[i]; only 4-byte alignment is declared.
 131   %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom
 132   %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4
       ; Store the loaded vector to %d[i].
 133   %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom
 134   store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4
       ; Advance the index and leave the loop once it reaches %n.
 135   %inc = add nuw i32 %i.07, 1
 136   %exitcond.not = icmp eq i32 %inc, %n
 137   br i1 %exitcond.not, label %for.end, label %for.body
 138
 139 for.end:                                          ; preds = %for.body, %entry
 140   ret void
 141 }
 142
 143 define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
 144 ; GCN-LABEL: copy_local:
 145 ; GCN:       ; %bb.0: ; %entry
 146 ; GCN-NEXT:    s_load_b96 s[0:2], s[2:3], 0x24
 147 ; GCN-NEXT:    s_wait_kmcnt 0x0
 148 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 149 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_2
 150 ; GCN-NEXT:  .LBB3_1: ; %for.body
 151 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
 152 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 153 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 154 ; GCN-NEXT:    s_add_co_i32 s2, s2, -1
 155 ; GCN-NEXT:    s_add_co_i32 s0, s0, 16
 156 ; GCN-NEXT:    s_add_co_i32 s1, s1, 16
 157 ; GCN-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
 158 ; GCN-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset1:1
 159 ; GCN-NEXT:    s_cmp_lg_u32 s2, 0
 160 ; GCN-NEXT:    s_wait_dscnt 0x1
 161 ; GCN-NEXT:    ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
 162 ; GCN-NEXT:    s_wait_dscnt 0x1
 163 ; GCN-NEXT:    ds_store_2addr_b32 v4, v2, v3 offset1:1
 164 ; GCN-NEXT:    s_cbranch_scc1 .LBB3_1
 165 ; GCN-NEXT:  .LBB3_2: ; %for.end
 166 ; GCN-NEXT:    s_endpgm
     ; Copy loop moving %n <4 x i32> elements from %s to %d, where both
     ; pointers are in addrspace(3).
     ; Unlike the other kernels in this file, the assertions above contain no
     ; s_prefetch_data: the loop body is expected to consist only of
     ; ds_load_2addr_b32 / ds_store_2addr_b32 pairs and the pointer and
     ; counter updates.
 167 entry:
       ; Bypass the loop entirely when %n is zero.
 168   %cmp6.not = icmp eq i32 %n, 0
 169   br i1 %cmp6.not, label %for.end, label %for.body
 170
 171 for.body:                                         ; preds = %entry, %for.body
       ; %i.07 is the element index: 0 on entry, %inc on the back edge.
 172   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
 173   %idxprom = zext i32 %i.07 to i64
       ; Load one 16-byte vector from %s[i]; only 4-byte alignment is declared.
 174   %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom
 175   %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4
       ; Store the loaded vector to %d[i].
 176   %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom
 177   store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4
       ; Advance the index and leave the loop once it reaches %n.
 178   %inc = add nuw i32 %i.07, 1
 179   %exitcond.not = icmp eq i32 %inc, %n
 180   br i1 %exitcond.not, label %for.end, label %for.body
 181
 182 for.end:                                          ; preds = %for.body, %entry
 183   ret void
 184 }