1 ; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
; Statically sized addrspace(3) arrays referenced by the kernels in this file.
; Byte sizes: @lds0 = 512 * 4 = 2048 (0x800), @lds1 = 256 * 4 = 1024 (0x400),
; @lds2 = 4096 * 4 = 16384 (0x4000), @lds3 = 67 (deliberately not a multiple
; of 4, 8 or 16).
3 @lds0 = addrspace(3) global [512 x float] undef
4 @lds1 = addrspace(3) global [256 x float] undef
5 @lds2 = addrspace(3) global [4096 x float] undef
6 @lds3 = addrspace(3) global [67 x i8] undef
; Zero-length external addrspace(3) arrays: the dynamically sized shared
; arrays. @dynamic_shared0 and @dynamic_shared1 carry no explicit alignment;
; @dynamic_shared2 and @dynamic_shared3 declare 4 and 16 bytes explicitly.
8 @dynamic_shared0 = external addrspace(3) global [0 x float]
9 @dynamic_shared1 = external addrspace(3) global [0 x double]
10 @dynamic_shared2 = external addrspace(3) global [0 x double], align 4
11 @dynamic_shared3 = external addrspace(3) global [0 x double], align 16
; Kernel that reads the static array @lds0 and writes the dynamic array
; @dynamic_shared0. The expected add immediate 0x800 (2048) equals the byte
; size of @lds0 (512 floats * 4 bytes).
13 ; CHECK-LABEL: {{^}}dynamic_shared_array_0:
14 ; CHECK: v_add_u32_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
15 define amdgpu_kernel void @dynamic_shared_array_0(float addrspace(1)* %out) {
16 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; Load element %tid.x of the static array.
17 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %tid.x
18 %val0 = load float, float addrspace(3)* %arrayidx0, align 4
; Store it to element %tid.x of the dynamic array.
19 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
20 store float %val0, float addrspace(3)* %arrayidx1, align 4
; Kernel that reads either @lds0 or @lds1 depending on %cond and writes the
; result to @dynamic_shared0. The expected add immediate 0xc00 (3072) equals
; the combined byte size of @lds0 (2048) and @lds1 (1024), even though each
; static array is only read on one path. The shifts by 2 scale an index by the
; 4-byte float element size.
24 ; CHECK-LABEL: {{^}}dynamic_shared_array_1:
25 ; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
26 ; CHECK: v_lshlrev_b32_e32 {{v[0-9]+}}, 2, {{v[0-9]+}}
27 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
28 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0xc00, [[IDX]]
29 define amdgpu_kernel void @dynamic_shared_array_1(float addrspace(1)* %out, i32 %cond) {
31 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; Both static loads use index %tid.x + 64; the dynamic store uses %tid.x.
32 %idx.0 = add nsw i32 %tid.x, 64
; Take %if when %cond is zero, %else otherwise.
33 %tmp = icmp eq i32 %cond, 0
34 br i1 %tmp, label %if, label %else
; %val0 is read from @lds0; the phi below takes it from predecessor %if.
37 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
38 %val0 = load float, float addrspace(3)* %arrayidx0, align 4
41 else: ; preds = %entry
; %val1 is read from @lds1.
42 %arrayidx1 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
43 %val1 = load float, float addrspace(3)* %arrayidx1, align 4
46 endif: ; preds = %else, %if
; Merge the value loaded on either path and store it to the dynamic array.
47 %val = phi float [ %val0, %if ], [ %val1, %else ]
48 %arrayidx = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
49 store float %val, float addrspace(3)* %arrayidx, align 4
; Kernel that reads the static array @lds2 and writes @dynamic_shared0.
; The expected add immediate 0x4000 (16384) equals the byte size of @lds2
; (4096 floats * 4 bytes).
53 ; CHECK-LABEL: {{^}}dynamic_shared_array_2:
54 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
55 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x4000, [[IDX]]
56 define amdgpu_kernel void @dynamic_shared_array_2(i32 %idx) {
57 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; The static load index is %tid.x + %idx; the dynamic store index is %tid.x.
58 %vidx = add i32 %tid.x, %idx
59 %arrayidx0 = getelementptr inbounds [4096 x float], [4096 x float] addrspace(3)* @lds2, i32 0, i32 %vidx
60 %val0 = load float, float addrspace(3)* %arrayidx0, align 4
61 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
62 store float %val0, float addrspace(3)* %arrayidx1, align 4
66 ; The offset to the dynamic shared memory array should be aligned to the alignment of its element type.
; Kernel that reads the 67-byte static array @lds3 and writes the float array
; @dynamic_shared0. The expected add immediate 0x44 (68) is 67 rounded up to a
; multiple of 4, the size of a float element.
68 ; CHECK-LABEL: {{^}}dynamic_shared_array_3:
69 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
70 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, 0x44, [[IDX]]
71 define amdgpu_kernel void @dynamic_shared_array_3(i32 %idx) {
72 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; The static load index is %tid.x + %idx; the dynamic store index is %tid.x.
73 %vidx = add i32 %tid.x, %idx
74 %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
75 %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
; Convert the loaded byte to float so it can be stored to the float array.
76 %val1 = uitofp i8 %val0 to float
77 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
78 store float %val1, float addrspace(3)* %arrayidx1, align 4
82 ; The offset to the dynamic shared memory array should be aligned to the largest alignment among the dynamic arrays used.
; Kernel that reads the 67-byte static array @lds3 and writes two dynamic
; arrays: @dynamic_shared0 (float) and @dynamic_shared1 (double, no explicit
; alignment). The expected base 0x48 (72) is 67 rounded up to a multiple of 8,
; and the same [[DYNLDS]] register is expected as the base of the add.
84 ; CHECK-LABEL: {{^}}dynamic_shared_array_4:
85 ; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x48
86 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
87 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
88 define amdgpu_kernel void @dynamic_shared_array_4(i32 %idx) {
89 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; The static load index is %tid.x + %idx; both dynamic stores use %tid.x.
90 %vidx = add i32 %tid.x, %idx
91 %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
92 %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
; The same loaded byte is converted once to float and once to double.
93 %val1 = uitofp i8 %val0 to float
94 %val2 = uitofp i8 %val0 to double
95 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
96 store float %val1, float addrspace(3)* %arrayidx1, align 4
97 %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared1, i32 0, i32 %tid.x
98 store double %val2, double addrspace(3)* %arrayidx2, align 4
102 ; Honor the explicit alignment from the specified variable.
; Here the double array is @dynamic_shared2, declared with "align 4", so the
; expected base is 0x44 (68): 67 bytes of @lds3 rounded up to a multiple of 4
; rather than 8.
103 ; CHECK-LABEL: {{^}}dynamic_shared_array_5:
104 ; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x44
105 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
106 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
107 define amdgpu_kernel void @dynamic_shared_array_5(i32 %idx) {
108 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; The static load index is %tid.x + %idx; both dynamic stores use %tid.x.
109 %vidx = add i32 %tid.x, %idx
110 %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
111 %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
; The same loaded byte is converted once to float and once to double.
112 %val1 = uitofp i8 %val0 to float
113 %val2 = uitofp i8 %val0 to double
114 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
115 store float %val1, float addrspace(3)* %arrayidx1, align 4
116 %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared2, i32 0, i32 %tid.x
117 store double %val2, double addrspace(3)* %arrayidx2, align 4
121 ; Honor the explicit alignment from the specified variable.
; Here the double array is @dynamic_shared3, declared with "align 16", so the
; expected base is 0x50 (80): 67 bytes of @lds3 rounded up to a multiple of 16.
122 ; CHECK-LABEL: {{^}}dynamic_shared_array_6:
123 ; CHECK: v_mov_b32_e32 [[DYNLDS:v[0-9]+]], 0x50
124 ; CHECK: v_lshlrev_b32_e32 [[IDX:v[0-9]+]], 2, {{v[0-9]+}}
125 ; CHECK: v_add_u32_e32 {{v[0-9]+}}, [[DYNLDS]], [[IDX]]
126 define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
127 %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
; The static load index is %tid.x + %idx; both dynamic stores use %tid.x.
128 %vidx = add i32 %tid.x, %idx
129 %arrayidx0 = getelementptr inbounds [67 x i8], [67 x i8] addrspace(3)* @lds3, i32 0, i32 %vidx
130 %val0 = load i8, i8 addrspace(3)* %arrayidx0, align 4
; The same loaded byte is converted once to float and once to double.
131 %val1 = uitofp i8 %val0 to float
132 %val2 = uitofp i8 %val0 to double
133 %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @dynamic_shared0, i32 0, i32 %tid.x
134 store float %val1, float addrspace(3)* %arrayidx1, align 4
135 %arrayidx2 = getelementptr inbounds [0 x double], [0 x double] addrspace(3)* @dynamic_shared3, i32 0, i32 %tid.x
136 store double %val2, double addrspace(3)* %arrayidx2, align 4
; Declaration of the intrinsic every kernel above calls to obtain %tid.x,
; the i32 value used to index the addrspace(3) arrays.
140 declare i32 @llvm.amdgcn.workitem.id.x()