; RUN: opt %loadPolly -S -polly-process-unprofitable -polly-acc-mincompute=0 -polly-target=gpu -polly-codegen-ppcg -polly-acc-codegen-managed-memory < %s | \
; RUN: FileCheck %s

; REQUIRES: pollyacc
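
; With -polly-acc-codegen-managed-memory, Polly generates host code that
; assumes all arrays live in CUDA managed (unified) memory, so the existing
; host pointers can be handed to the GPU kernel directly.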

;    #include <cuda_runtime.h>
;
;    static const int N = 45;
;
;    void copy(int *R, int *A) {
;      for (int i = 0; i < N; i++) {
;        R[i] = A[i] * 10;
;      }
;    }
;
;    int main() {
;      int *A, *R;
;
;      cudaMallocManaged((void **)(&A), sizeof(int) * N, cudaMemAttachGlobal);
;      cudaMallocManaged((void **)(&R), sizeof(int) * N, cudaMemAttachGlobal);
;
;      for (int i = 0; i < N; i++) {
;        A[i] = i;
;        R[i] = 0;
;      }
;
;      copy(R, A);
;
;      return 0;
;    }
;
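
; Because R and A are both managed allocations, no separate device buffers and
; no explicit host <-> device transfers are needed; the CHECK-NOTs below
; verify that none are emitted.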

; CHECK-NOT: polly_copyFromHostToDevice
; CHECK-NOT: polly_copyFromDeviceToHost
; CHECK-NOT: polly_freeDeviceMemory
; CHECK-NOT: polly_allocateMemoryForDevice

; CHECK:      %[[REGCTX:[0-9]+]] = call i8* @polly_initContextCUDA()
; CHECK-NEXT: %[[REGCA:[0-9]+]] = bitcast i32* %A to i8*
; CHECK-NEXT: %[[REGCR:[0-9]+]] = bitcast i32* %R to i8*
; CHECK-NEXT: %[[REGGEP0:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; CHECK-NEXT: store i8* %[[REGCA]], i8** %polly_launch_0_param_0
; CHECK-NEXT: %[[REGCP0:[0-9]+]] = bitcast i8** %polly_launch_0_param_0 to i8*
; CHECK-NEXT: store i8* %[[REGCP0]], i8** %[[REGGEP0]]
; CHECK-NEXT: %[[REGGEP1:[0-9]+]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 1
; CHECK-NEXT: store i8* %[[REGCR]], i8** %polly_launch_0_param_1
; CHECK-NEXT: %[[REGCP1:[0-9]+]] = bitcast i8** %polly_launch_0_param_1 to i8*
; CHECK-NEXT: store i8* %[[REGCP1]], i8** %[[REGGEP1]]
; CHECK-NEXT: %[[REGKERNEL:[0-9]+]] = call i8* @polly_getKernel(i8* getelementptr inbounds ([852 x i8], [852 x i8]* @FUNC_copy_SCOP_0_KERNEL_0, i32 0, i32 0), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @FUNC_copy_SCOP_0_KERNEL_0_name, i32 0, i32 0))
; CHECK-NEXT: call void @polly_launchKernel(i8* %[[REGKERNEL]], i32 2, i32 1, i32 32, i32 1, i32 1, i8* %polly_launch_0_params_i8ptr)
; CHECK-NEXT: call void @polly_freeKernel(i8* %[[REGKERNEL]])
; CHECK-NEXT: call void @polly_synchronizeDevice()
; CHECK-NEXT: call void @polly_freeContext(i8* %[[REGCTX]])
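
; The launch uses a 2x1 grid of 32x1x1 blocks: 45 iterations at 32 threads
; per block require ceil(45/32) = 2 blocks.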

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @copy(i32* %R, i32* %A) {
entry:
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
  %exitcond = icmp ne i64 %indvars.iv, 45
  br i1 %exitcond, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %tmp = load i32, i32* %arrayidx, align 4
  %mul = mul nsw i32 %tmp, 10
  %arrayidx2 = getelementptr inbounds i32, i32* %R, i64 %indvars.iv
  store i32 %mul, i32* %arrayidx2, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret void
}
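
; The loop in @copy is the SCoP that Polly offloads (hence the kernel name
; FUNC_copy_SCOP_0_KERNEL_0); @main only sets up the managed buffers.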

define i32 @main() {
entry:
  %A = alloca i32*, align 8
  %R = alloca i32*, align 8
  %tmp = bitcast i32** %A to i8**
  %call = call i32 @cudaMallocManaged(i8** nonnull %tmp, i64 180, i32 1) #2
  %tmp1 = bitcast i32** %R to i8**
  %call1 = call i32 @cudaMallocManaged(i8** nonnull %tmp1, i64 180, i32 1) #2
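  ; Each buffer is sizeof(int) * N = 4 * 45 = 180 bytes; the final argument
  ; i32 1 is the value of cudaMemAttachGlobal.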
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %entry
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
  %exitcond = icmp ne i64 %indvars.iv, 45
  br i1 %exitcond, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %tmp2 = load i32*, i32** %A, align 8
  %arrayidx = getelementptr inbounds i32, i32* %tmp2, i64 %indvars.iv
  %tmp3 = trunc i64 %indvars.iv to i32
  store i32 %tmp3, i32* %arrayidx, align 4
  %tmp4 = load i32*, i32** %R, align 8
  %arrayidx3 = getelementptr inbounds i32, i32* %tmp4, i64 %indvars.iv
  store i32 0, i32* %arrayidx3, align 4
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  %tmp5 = load i32*, i32** %R, align 8
  %tmp6 = load i32*, i32** %A, align 8
  call void @copy(i32* %tmp5, i32* %tmp6)
  ret i32 0
}

declare i32 @cudaMallocManaged(i8**, i64, i32) #1

; Attribute groups for the references above; the exact attributes are
; immaterial to this test, but the groups must be defined for the IR to parse.
attributes #1 = { nounwind }
attributes #2 = { nounwind }