llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll

   1 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
   2
   3
   4 ; There is no dependence between the store and the two loads, so we can combine
   5 ; the loads and schedule them freely.
   6
   7 ; GCN-LABEL: {{^}}ds_combine_nodep
   8
   9 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
  10 ; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:8
  11 ; GCN: s_waitcnt lgkmcnt({{[0-9]+}})
  12 define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 {
  13   ; No-dependence case: both loads read within bytes 24-35 of %inptr, the store writes bytes 104-111.
  14   %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24  ; byte offset 24 = float index 6
  15   %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
  16   %v0 = extractelement <3 x float> %load0, i32 2  ; only element 2 is used: float index 8 (byte 32)
  17   ; Build the constant vector <1.0, 2.0> that serves as the store data.
  18   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  19   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  20   ; Store to float indices 26 and 27 (bytes 104-111); this range overlaps neither load.
  21   %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
  22   store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
  23   ; Second load reads float index 7 (byte 28); with index 8 above this gives the expected offset0:7 offset1:8 pair.
  24   %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 7
  25   %v1 = load float, ptr addrspace(3) %vaddr1, align 4
  26   ; Add the two loaded values and store the sum through %out.
  27   %sum = fadd float %v0, %v1
  28   store float %sum, ptr addrspace(1) %out, align 4
  29   ret void
  30 }
  31
  32
  33 ; The store depends on the first load, so we cannot move the first load down to combine with
  34 ; the second load directly. However, we can move the store after the combined load.
  35
  36 ; GCN-LABEL: {{^}}ds_combine_WAR
  37
  38 ; GCN:      ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
  39 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
  40 define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 {
  41   ; Write-after-read case: the first load reads bytes 100-111, and the store below overwrites bytes 104-111.
  42   %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100  ; byte offset 100 = float index 25
  43   %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
  44   %v0 = extractelement <3 x float> %load0, i32 2  ; only element 2 is used: float index 27 (byte 108)
  45   ; Build the constant vector <1.0, 2.0> that serves as the store data.
  46   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  47   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  48   ; Store to float indices 26 and 27 (bytes 104-111); index 27 is the element read by the first load.
  49   %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
  50   store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
  51   ; Second load reads float index 7 (byte 28), which the store does not touch.
  52   %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 7
  53   %v1 = load float, ptr addrspace(3) %vaddr1, align 4
  54   ; Add the two loaded values and store the sum through %out.
  55   %sum = fadd float %v0, %v1
  56   store float %sum, ptr addrspace(1) %out, align 4
  57   ret void
  58 }
  59
  60
  61 ; The second load depends on the store. We could combine the two loads, putting
  62 ; the combined load at the original place of the second load, but we prefer to
  63 ; leave the first load near the start of the function to hide its latency.
  64
  65 ; GCN-LABEL: {{^}}ds_combine_RAW
  66
  67 ; GCN:      ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
  68 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
  69 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
  70 define amdgpu_kernel void @ds_combine_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 {
  71   ; Read-after-write case: the second load reads float index 26, which the store below has just written.
  72   %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 24  ; byte offset 24 = float index 6
  73   %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
  74   %v0 = extractelement <3 x float> %load0, i32 2  ; only element 2 is used: float index 8 (byte 32)
  75   ; Build the constant vector <1.0, 2.0> that serves as the store data.
  76   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
  77   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
  78   ; Store to float indices 26 and 27 (bytes 104-111); this range does not overlap the first load (bytes 24-35).
  79   %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
  80   store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
  81   ; Second load uses the same address as the store (float index 26, byte 104).
  82   %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 26
  83   %v1 = load float, ptr addrspace(3) %vaddr1, align 4
  84   ; Add the two loaded values and store the sum through %out.
  85   %sum = fadd float %v0, %v1
  86   store float %sum, ptr addrspace(1) %out, align 4
  87   ret void
  88 }
  89
  90
  91 ; The store depends on the first load, and the second load also depends on the store,
  92 ; so we cannot combine the two loads.
  93
  94 ; GCN-LABEL: {{^}}ds_combine_WAR_RAW
  95
  96 ; GCN:      ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:108
  97 ; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
  98 ; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
  99 define amdgpu_kernel void @ds_combine_WAR_RAW(ptr addrspace(1) %out, ptr addrspace(3) %inptr) #0 {
 100   ; Both hazards at once: the store overwrites data the first load reads, and the second load reads what the store wrote.
 101   %addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100  ; byte offset 100 = float index 25
 102   %load0 = load <3 x float>, ptr addrspace(3) %addr0, align 4
 103   %v0 = extractelement <3 x float> %load0, i32 2  ; only element 2 is used: float index 27 (byte 108)
 104   ; Build the constant vector <1.0, 2.0> that serves as the store data.
 105   %tmp1 = insertelement <2 x float> undef, float 1.0, i32 0
 106   %data = insertelement <2 x float> %tmp1, float 2.0, i32 1
 107   ; Store to float indices 26 and 27 (bytes 104-111); index 27 is the element read by the first load.
 108   %tmp2 = getelementptr float, ptr addrspace(3) %inptr, i32 26
 109   store <2 x float> %data, ptr addrspace(3) %tmp2, align 4
 110   ; Second load uses the same address as the store (float index 26, byte 104).
 111   %vaddr1 = getelementptr float, ptr addrspace(3) %inptr, i32 26
 112   %v1 = load float, ptr addrspace(3) %vaddr1, align 4
 113   ; Add the two loaded values and store the sum through %out.
 114   %sum = fadd float %v0, %v1
 115   store float %sum, ptr addrspace(1) %out, align 4
 116   ret void
 117 }
 118 ; Attribute group #0 is attached to each kernel visible above. NOTE(review): the "amdgpu-no-*" strings presumably mark the corresponding implicit kernel inputs as unused - confirm against the AMDGPU backend documentation.
 119 attributes #0 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }