llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll

   1 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   2 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   3 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   4 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   6 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
   7
   8 ; GCN-LABEL: {{^}}full_mask:
     ; init.exec with the constant -1. The checks match the exec write first and
     ; then a v_add_f32, although the fadd precedes the intrinsic call in the IR.
   9 ; GCN: s_mov_b64 exec, -1
  10 ; GCN: v_add_f32_e32 v0,
  11 define amdgpu_ps float @full_mask(float %a, float %b) {
  12 main_body:
  13   %s = fadd float %a, %b
  14   call void @llvm.amdgcn.init.exec(i64 -1)
  15   ret float %s
  16 }
  17
  18 ; GCN-LABEL: {{^}}partial_mask:
     ; init.exec with a constant other than -1: the IR constant 123456 is
     ; expected to show up as the hex immediate 0x1e240 in the exec write.
  19 ; GCN: s_mov_b64 exec, 0x1e240
  20 ; GCN: v_add_f32_e32 v0,
  21 define amdgpu_ps float @partial_mask(float %a, float %b) {
  22 main_body:
  23   %s = fadd float %a, %b
  24   call void @llvm.amdgcn.init.exec(i64 123456)
  25   ret float %s
  26 }
  27
  28 ; GCN-LABEL: {{^}}input_s3off8:
     ; init.exec.from.input with %count as the fourth inreg argument and an
     ; offset of 8. The checks expect the count to be read from s3 with the
     ; s_bfe_u32 immediate 0x70008 (low byte 0x08 = offset 8), exec to be
     ; written by s_bfm_b64, and a compare against 64 followed by
     ; s_cmov_b64 exec, -1.
  29 ; GCN: s_bfe_u32 s0, s3, 0x70008
  30 ; GCN: s_bfm_b64 exec, s0, 0
  31 ; GCN: s_cmp_eq_u32 s0, 64
  32 ; GCN: s_cmov_b64 exec, -1
  33 ; GCN: v_add_f32_e32 v0,
  34 define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
  35 main_body:
  36   %s = fadd float %a, %b
  37   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  38   ret float %s
  39 }
  40
  41 ; GCN-LABEL: {{^}}input_s0off19:
     ; init.exec.from.input with %count as the first inreg argument and an
     ; offset of 19. The checks expect the count to be read from s0 with the
     ; s_bfe_u32 immediate 0x70013 (low byte 0x13 = offset 19), then the same
     ; s_bfm_b64 / s_cmp_eq_u32 / s_cmov_b64 sequence on exec.
  42 ; GCN: s_bfe_u32 s0, s0, 0x70013
  43 ; GCN: s_bfm_b64 exec, s0, 0
  44 ; GCN: s_cmp_eq_u32 s0, 64
  45 ; GCN: s_cmov_b64 exec, -1
  46 ; GCN: v_add_f32_e32 v0,
  47 define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
  48 main_body:
  49   %s = fadd float %a, %b
  50   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  51   ret float %s
  52 }
  53
  54 ; GCN-LABEL: {{^}}reuse_input:
     ; %count is used again after the init.exec.from.input call. The checks
     ; expect the extracted field to go to s1 so that s0 still holds %count
     ; for the later v_add (v_add_u32 or v_add_nc_u32, depending on target).
  55 ; GCN: s_bfe_u32 s1, s0, 0x70013
  56 ; GCN: s_bfm_b64 exec, s1, 0
  57 ; GCN: s_cmp_eq_u32 s1, 64
  58 ; GCN: s_cmov_b64 exec, -1
  59 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
  60 define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
  61 main_body:
  62   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  63   %s = add i32 %a, %count
  64   %f = sitofp i32 %s to float
  65   ret float %f
  66 }
  67
  68 ; GCN-LABEL: {{^}}reuse_input2:
     ; Same computation as reuse_input, but here the add and sitofp come
     ; before the init.exec.from.input call in the IR. The expected output is
     ; identical: the exec setup is matched first, then the v_add reading s0.
  69 ; GCN: s_bfe_u32 s1, s0, 0x70013
  70 ; GCN: s_bfm_b64 exec, s1, 0
  71 ; GCN: s_cmp_eq_u32 s1, 64
  72 ; GCN: s_cmov_b64 exec, -1
  73 ; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
  74 define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
  75 main_body:
  76   %s = add i32 %a, %count
  77   %f = sitofp i32 %s to float
  78   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  79   ret float %f
  80 }
  81
  82 ; GCN-LABEL: {{^}}init_unreachable:
  83 ;
  84 ; This used to crash.
     ; init.exec is immediately followed by unreachable. Only the function
     ; label is checked; there are no instruction checks for this function.
  85 define amdgpu_ps void @init_unreachable() {
  86 main_body:
  87   call void @llvm.amdgcn.init.exec(i64 -1)
  88   unreachable
  89 }
  90
  91 ; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
     ; init.exec(-1) in a function with two addrspace(5) allocas that are
     ; stored to with constant indices and loaded from with the dynamic index
     ; %b. The checks expect the exec write to be matched before any v_mov /
     ; v_add, and no line starting with v_ between the label and that write.
     ; NOTE(review): llc normally indents instructions, so the ^ anchor in the
     ; NOT-pattern below may keep it from ever matching - confirm.
  92 ; GCN-NOT: {{^}}v_
  93 ; GCN: s_mov_b64 exec, -1
  94 ; GCN: v_mov
  95 ; GCN: v_add
  96 define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
  97 main_body:
  98   %array0 = alloca [1024 x i32], align 16, addrspace(5)
  99   %array1 = alloca [20 x i32], align 16, addrspace(5)
 100   call void @llvm.amdgcn.init.exec(i64 -1)
 101
 102   %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
 103   store i32 %a, ptr addrspace(5) %ptr0, align 4
 104
 105   %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
 106   store i32 %a, ptr addrspace(5) %ptr1, align 4
 107
 108   %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
 109   store i32 %b, ptr addrspace(5) %ptr2, align 4
 110
 111   %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
 112   %v3 = load i32, ptr addrspace(5) %ptr3, align 4
 113
 114   %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
 115   %v4 = load i32, ptr addrspace(5) %ptr4, align 4
 116
 117   %v5 = add i32 %v3, %v4
 118   %v = bitcast i32 %v5 to float
 119   ret float %v
 120 }
 121
 122 ; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
     ; Same alloca/store/load body as init_exec_before_frame_materialize, but
     ; exec is initialised by init.exec.from.input(%count, 8), with %count as
     ; the third inreg argument (expected in s2). The four exec-setup
     ; instructions must be consecutive (NEXT checks) and are matched before
     ; any v_mov / v_add.
     ; NOTE(review): llc normally indents instructions, so the ^ anchor in the
     ; NOT-pattern below may keep it from ever matching - confirm.
 123 ; GCN-NOT: {{^}}v_
 124 ; GCN: s_bfe_u32 s2, s2, 0x70008
 125 ; GCN-NEXT: s_bfm_b64 exec, s2, 0
 126 ; GCN-NEXT: s_cmp_eq_u32 s2, 64
 127 ; GCN-NEXT: s_cmov_b64 exec, -1
 128 ; GCN: v_mov
 129 ; GCN: v_add
 130 define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
 131 main_body:
 132   %array0 = alloca [1024 x i32], align 16, addrspace(5)
 133   %array1 = alloca [20 x i32], align 16, addrspace(5)
 134   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
 135
 136   %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
 137   store i32 %a, ptr addrspace(5) %ptr0, align 4
 138
 139   %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
 140   store i32 %a, ptr addrspace(5) %ptr1, align 4
 141
 142   %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
 143   store i32 %b, ptr addrspace(5) %ptr2, align 4
 144
 145   %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
 146   %v3 = load i32, ptr addrspace(5) %ptr3, align 4
 147
 148   %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
 149   %v4 = load i32, ptr addrspace(5) %ptr4, align 4
 150
 151   %v5 = add i32 %v3, %v4
 152   %v = bitcast i32 %v5 to float
 153   ret float %v
 154 }
 155
 156 ; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
     ; Variant where init.exec.from.input is called in %endif rather than in
     ; the entry block, and %count is used again afterwards (%v6). The checks
     ; first match the text %endif, then the consecutive exec-setup sequence
     ; reading s2; the destination register is captured as S instead of being
     ; hard-coded.
     ; NOTE(review): llc normally indents instructions, so the ^ anchor in the
     ; NOT-pattern below may keep it from ever matching - confirm.
 157 ; GCN-NOT: {{^}}v_
 158 ; GCN: %endif
 159 ; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
 160 ; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
 161 ; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
 162 ; GCN-NEXT: s_cmov_b64 exec, -1
 163 ; GCN: v_mov
 164 ; GCN: v_add
 165 define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
 166 main_body:
 167   ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
 168   %array0 = alloca [1024 x i32], align 16, addrspace(5)
 169   %array1 = alloca [20 x i32], align 16, addrspace(5)
 170
       ; Branch on %count so that %endif is a separate, non-entry block.
 171   %cc = icmp uge i32 %count, 32
 172   br i1 %cc, label %endif, label %if
 173
 174 if:
       ; Empty inline asm marked sideeffect; the only content of the %if block.
 175   call void asm sideeffect "", ""()
 176   br label %endif
 177
 178 endif:
 179   call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
 180
 181   %ptr0 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 1
 182   store i32 %a, ptr addrspace(5) %ptr0, align 4
 183
 184   %ptr1 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 1
 185   store i32 %a, ptr addrspace(5) %ptr1, align 4
 186
 187   %ptr2 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 2
 188   store i32 %b, ptr addrspace(5) %ptr2, align 4
 189
 190   %ptr3 = getelementptr inbounds [20 x i32], ptr addrspace(5) %array1, i32 0, i32 %b
 191   %v3 = load i32, ptr addrspace(5) %ptr3, align 4
 192
 193   %ptr4 = getelementptr inbounds [1024 x i32], ptr addrspace(5) %array0, i32 0, i32 %b
 194   %v4 = load i32, ptr addrspace(5) %ptr4, align 4
 195
 196   %v5 = add i32 %v3, %v4
 197   %v6 = add i32 %v5, %count
 198   %v = bitcast i32 %v6 to float
 199   ret float %v
 200 }
 201
     ; Declarations of the two intrinsics exercised above: init.exec takes an
     ; i64 constant, init.exec.from.input takes an i32 value and an i32 offset.
     ; Both use attribute group #1, which marks them convergent.
 202 declare void @llvm.amdgcn.init.exec(i64) #1
 203 declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1
 204
 205 attributes #1 = { convergent }