1 ; RUN: opt -mtriple=amdgcn-- -S -separate-const-offset-from-gep -reassociate-geps-verify-no-dead-code -gvn < %s | FileCheck -check-prefix=IR %s
3 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
5 @array = internal addrspace(4) constant [4096 x [32 x float]] zeroinitializer, align 4
7 ; IR-LABEL: @sum_of_array(
8 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
9 ; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 1
10 ; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 32
11 ; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 33
define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; Sums array[x][y], array[x][y+1], array[x+1][y], array[x+1][y+1].
  ; The pass should rewrite the last three GEPs as constant offsets
  ; (1, 32, 33 floats) from the first one, as the CHECK lines above verify.
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp
  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
  ; Restored: the definition of %tmp6 was missing, leaving %tmp7 with an
  ; undefined operand. The CHECKed offsets (1 and 33 = 32 + 1) require y + 1.
  %tmp6 = add i32 %y, 1
  %tmp7 = sext i32 %tmp6 to i64
  %tmp8 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp1, i64 %tmp7
  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
  %tmp11 = fadd float %tmp5, %tmp10
  %tmp12 = add i32 %x, 1
  %tmp13 = sext i32 %tmp12 to i64
  %tmp14 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp
  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
  %tmp17 = fadd float %tmp11, %tmp16
  %tmp18 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(4)* @array, i64 0, i64 %tmp13, i64 %tmp7
  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
  %tmp21 = fadd float %tmp17, %tmp20
  store float %tmp21, float addrspace(1)* %output, align 4
  ; Restored: missing terminator and closing brace.
  ret void
}
35 @array2 = internal addrspace(4) constant [4096 x [4 x float]] zeroinitializer, align 4
37 ; Some of the indices go over the maximum mubuf offset, so don't split them.
39 ; IR-LABEL: @sum_of_array_over_max_mubuf_offset(
40 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
41 ; IR: getelementptr inbounds float, float addrspace(4)* [[BASE_PTR]], i64 255
43 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
44 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; Same shape as @sum_of_array but with a [4096 x [4 x float]] array and
  ; offsets y+255 and x+256: the x+256 offset (256*4 floats = 4096 bytes)
  ; exceeds the mubuf immediate-offset range, so only the y+255 access is
  ; split off the base pointer (see the CHECK lines above).
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp
  %tmp4 = load float, float addrspace(4)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
  %tmp6 = add i32 %y, 255
  %tmp7 = sext i32 %tmp6 to i64
  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp1, i64 %tmp7
  %tmp10 = load float, float addrspace(4)* %tmp8, align 4
  %tmp11 = fadd float %tmp5, %tmp10
  %tmp12 = add i32 %x, 256
  %tmp13 = sext i32 %tmp12 to i64
  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp
  %tmp16 = load float, float addrspace(4)* %tmp14, align 4
  %tmp17 = fadd float %tmp11, %tmp16
  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(4)* @array2, i64 0, i64 %tmp13, i64 %tmp7
  %tmp20 = load float, float addrspace(4)* %tmp18, align 4
  %tmp21 = fadd float %tmp17, %tmp20
  store float %tmp21, float addrspace(1)* %output, align 4
  ; Restored: missing terminator and closing brace.
  ret void
}
69 @lds_array = internal addrspace(3) global [4096 x [4 x float]] undef, align 4
71 ; DS instructions have a larger immediate offset, so make sure these are OK.
72 ; IR-LABEL: @sum_of_lds_array_over_max_mubuf_offset(
73 ; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %{{[a-zA-Z0-9]+}}, i32 %{{[a-zA-Z0-9]+}}
74 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
75 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
76 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  ; LDS (addrspace 3) version: DS instructions take larger immediate offsets,
  ; so all three offset accesses (255, 4032*4 = 16128, and 16383 elements)
  ; are split off the base pointer, as the CHECK lines above verify.
  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
  %tmp4 = load float, float addrspace(3)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
  %tmp6 = add i32 %y, 255
  %tmp8 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %tmp6
  %tmp10 = load float, float addrspace(3)* %tmp8, align 4
  %tmp11 = fadd float %tmp5, %tmp10
  %tmp12 = add i32 %x, 4032
  %tmp14 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %tmp12, i32 %y
  %tmp16 = load float, float addrspace(3)* %tmp14, align 4
  %tmp17 = fadd float %tmp11, %tmp16
  %tmp18 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %tmp12, i32 %tmp6
  %tmp20 = load float, float addrspace(3)* %tmp18, align 4
  %tmp21 = fadd float %tmp17, %tmp20
  store float %tmp21, float addrspace(1)* %output, align 4
  ; Restored: missing terminator and closing brace.
  ret void
}
96 ; IR-LABEL: @keep_metadata(
97 ; IR: getelementptr {{.*}} !amdgpu.uniform
98 ; IR: getelementptr {{.*}} !amdgpu.uniform
99 ; IR: getelementptr {{.*}} !amdgpu.uniform
define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @keep_metadata([0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(4)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 {
; Verifies the pass preserves !amdgpu.uniform metadata when it rewrites GEPs
; (the three CHECK lines above each require the metadata on a GEP).
main_body:
  %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8
  %23 = bitcast float %22 to i32
  ; Restored: the defining line for %24 was missing. Reconstructed as a shift
  ; of the interpolated index — NOTE(review): exact arithmetic is a guess;
  ; any i32 value keeps the test's intent (metadata preservation). TODO confirm
  ; against the upstream LLVM test.
  %24 = shl i32 %23, 1
  %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(4)* %1, i32 0, i32 %24, !amdgpu.uniform !0
  %26 = load <8 x i32>, <8 x i32> addrspace(4)* %25, align 32, !invariant.load !0
  ; Restored: %27/%28 were missing. The `or` with a small constant gives the
  ; pass a constant offset to separate, producing the third checked GEP —
  ; NOTE(review): reconstructed; TODO confirm constants against upstream.
  %27 = shl i32 %23, 2
  %28 = or i32 %27, 3
  %29 = bitcast [0 x <8 x i32>] addrspace(4)* %1 to [0 x <4 x i32>] addrspace(4)*
  %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(4)* %29, i32 0, i32 %28, !amdgpu.uniform !0
  %31 = load <4 x i32>, <4 x i32> addrspace(4)* %30, align 16, !invariant.load !0
  %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8
  %33 = extractelement <4 x float> %32, i32 0
  %34 = extractelement <4 x float> %32, i32 1
  %35 = extractelement <4 x float> %32, i32 2
  %36 = extractelement <4 x float> %32, i32 3
  %37 = bitcast float %4 to i32
  ; Pack the sampled components into the pixel-shader export struct.
  %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4
  %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5
  %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6
  %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7
  %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8
  %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19
  ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43
; Restored: missing closing brace.
}
; Function Attrs: nounwind readnone speculatable
declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7

attributes #5 = { "InitialPSInputAddr"="45175" }
attributes #6 = { nounwind readnone speculatable }
attributes #7 = { nounwind readonly }
attributes #8 = { nounwind readnone }

; Restored: the definition of metadata node !0 was missing, but it is
; referenced by !amdgpu.uniform and !invariant.load attachments above;
; an empty node is the conventional marker value for both.
!0 = !{}