1 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
2 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=true < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_ON %s
3 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
4 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
6 ; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
7 ; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
8 ; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
9 ; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [2 x i32 addrspace(3)*] }
; @lds.unused has no visible use in any kernel of this file. The two
; expectations below pin its alignment: raised from 2 to 4 when
; --amdgpu-super-align-lds-globals=true, left at 2 when it is false.
11 ; SUPER-ALIGN_ON: @lds.unused = addrspace(3) global i32 undef, align 4
12 ; SUPER-ALIGN_OFF: @lds.unused = addrspace(3) global i32 undef, align 2
13 @lds.unused = addrspace(3) global i32 undef, align 2
; @lds.1 is a 32-byte array declared with align 1; its only visible user is
; @k1. The expectations below require the k1 replacement struct global to be
; aligned to 16 with super-alignment enabled and to keep align 1 otherwise.
16 @lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
18 ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
19 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
21 ; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
22 ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
23 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
25 ; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16
26 ; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 4
29 ; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
30 ; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
31 ; CHECK: %ptr = getelementptr inbounds i8, i8* %2, i64 %x
32 ; CHECK: store i8 1, i8* %ptr, align 1
; @k1 stores one byte through a flat (addrspace 0) pointer that is derived
; from @lds.1 via a constant addrspacecast and indexed by the runtime value %x.
; The expectations preceding this function require that constant expression
; to be rewritten into the %1/%2 instructions addressing field 0 of the k1
; struct, with the store itself staying at align 1.
33 define amdgpu_kernel void @k1(i64 %x) {
34 %ptr = getelementptr inbounds i8, i8* addrspacecast ([32 x i8] addrspace(3)* @lds.1 to i8*), i64 %x
35 store i8 1, i8 addrspace(0)* %ptr, align 1
; @lds.2 and @lds.3 are i16 variables that are both over-aligned to 4.
39 @lds.2 = internal unnamed_addr addrspace(3) global i16 undef, align 4
40 @lds.3 = internal unnamed_addr addrspace(3) global i16 undef, align 4
42 ; Check that alignment is propagated to uses for scalar variables.
; Both stores in @k2 are written with align 2. The expected output raises
; them to align 4 and addresses fields 0 and 2 of the k2 struct; field 1 is
; the [2 x i8] member that sits between the two i16 members in the expected
; struct type.
45 ; CHECK: store i16 1, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0), align 4
46 ; CHECK: store i16 2, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 2), align 4
47 define amdgpu_kernel void @k2() {
48 store i16 1, i16 addrspace(3)* @lds.2, align 2
49 store i16 2, i16 addrspace(3)* @lds.3, align 2
; @lds.4 (32 x i64, align 8) and @lds.5 (32 x i32, align 4) are the two
; variables accessed by @k3; the expected k3 struct type lists them in this
; same order.
53 @lds.4 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
54 @lds.5 = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
56 ; Check that alignment is propagated to uses for arrays.
59 ; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
60 ; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
61 ; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
62 ; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
63 ; CHECK: store i32 4, i32 addrspace(3)* %ptr4, align 4
64 ; CHECK: store i32 5, i32 addrspace(3)* %ptr5, align 4
65 ; CHECK: %load1 = load i32, i32 addrspace(3)* %ptr1, align 8
66 ; CHECK: %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
67 ; SUPER-ALIGN_ON: %load3 = load i32, i32 addrspace(3)* %ptr3, align 16
68 ; SUPER-ALIGN_OFF: %load3 = load i32, i32 addrspace(3)* %ptr3, align 8
69 ; CHECK: %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
70 ; CHECK: %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
71 ; CHECK: %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 8
72 ; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 8
73 ; CHECK: %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
74 ; CHECK: %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
75 ; CHECK: %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
76 ; CHECK: %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
77 ; CHECK: store i16 11, i16 addrspace(3)* %ptr1.bc, align 8
78 ; CHECK: store i16 12, i16 addrspace(3)* %ptr2.bc, align 4
79 ; SUPER-ALIGN_ON: store i16 13, i16 addrspace(3)* %ptr3.bc, align 16
80 ; SUPER-ALIGN_OFF: store i16 13, i16 addrspace(3)* %ptr3.bc, align 8
81 ; CHECK: store i16 14, i16 addrspace(3)* %ptr4.bc, align 4
82 ; CHECK: %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
83 ; CHECK: %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
84 ; CHECK: %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
85 ; CHECK: %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
86 ; CHECK: store i32 21, i32* %ptr1.ac, align 8
87 ; CHECK: store i32 22, i32* %ptr2.ac, align 4
88 ; SUPER-ALIGN_ON: store i32 23, i32* %ptr3.ac, align 16
89 ; SUPER-ALIGN_OFF: store i32 23, i32* %ptr3.ac, align 8
90 ; CHECK: store i32 24, i32* %ptr4.ac, align 4
; @k3 exercises alignment propagation through several kinds of users of
; pointers into @lds.5: plain stores and loads, atomics, bitcast pointers and
; addrspacecast pointers. Every access below is written with the minimum
; alignment of its type; the expectations preceding this function state the
; alignment each one should carry after the pass.
91 define amdgpu_kernel void @k3(i64 %x) {
; Touch element 0 of @lds.4 so that this variable is also used by the kernel.
; No expectation line in the visible text covers this particular store.
92 %ptr0 = getelementptr inbounds i64, i64 addrspace(3)* bitcast ([32 x i64] addrspace(3)* @lds.4 to i64 addrspace(3)*), i64 0
93 store i64 0, i64 addrspace(3)* %ptr0, align 8
; Element indices 2, 3, 4 and 5 of @lds.5 are byte offsets 8, 12, 16 and 20
; from its start; %ptr5 uses the runtime index %x. The expected alignments
; (8, 4, 16-or-8, 4, and 4 for %ptr5) are consistent with @lds.5 starting at
; byte 256 of the k3 struct, directly after the 256-byte [32 x i64] field.
; NOTE(review): that layout is inferred from the expected struct type, not
; from the pass source - confirm.
95 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 2
96 %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 3
97 %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 4
98 %ptr4 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 5
99 %ptr5 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 %x
; Direct i32 stores through each pointer, all written with align 4.
101 store i32 1, i32 addrspace(3)* %ptr1, align 4
102 store i32 2, i32 addrspace(3)* %ptr2, align 4
103 store i32 3, i32 addrspace(3)* %ptr3, align 4
104 store i32 4, i32 addrspace(3)* %ptr4, align 4
105 store i32 5, i32 addrspace(3)* %ptr5, align 4
; Matching i32 loads; the expectations give them the same alignments as the
; stores above.
107 %load1 = load i32, i32 addrspace(3)* %ptr1, align 4
108 %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
109 %load3 = load i32, i32 addrspace(3)* %ptr3, align 4
110 %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
111 %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
; Atomic users of %ptr1; both are expected to be raised to align 8.
113 %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 4
114 %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 4
; Reinterpret the pointers as i16 pointers. The i16 stores start at align 2
; and are expected to end up with the same alignment as the i32 accesses to
; the same address.
116 %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
117 %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
118 %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
119 %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
121 store i16 11, i16 addrspace(3)* %ptr1.bc, align 2
122 store i16 12, i16 addrspace(3)* %ptr2.bc, align 2
123 store i16 13, i16 addrspace(3)* %ptr3.bc, align 2
124 store i16 14, i16 addrspace(3)* %ptr4.bc, align 2
; Cast the pointers to the flat address space. Stores through the cast
; pointers are likewise expected to receive the propagated alignment.
126 %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
127 %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
128 %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
129 %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
131 store i32 21, i32* %ptr1.ac, align 4
132 store i32 22, i32* %ptr2.ac, align 4
133 store i32 23, i32* %ptr3.ac, align 4
134 store i32 24, i32* %ptr4.ac, align 4
; @lds.6 holds two addrspace(3) pointers (align 4) and is accessed by @k4.
; The expected k4 replacement global is aligned to 16 with super-alignment
; enabled and to 4 with it disabled.
139 @lds.6 = internal unnamed_addr addrspace(3) global [2 x i32 addrspace(3)*] undef, align 4
141 ; Check that alignment is not propagated if the use is not a pointer operand.
144 ; SUPER-ALIGN_ON: store i32 undef, i32 addrspace(3)* %ptr, align 8
145 ; SUPER-ALIGN_OFF: store i32 undef, i32 addrspace(3)* %ptr, align 4
146 ; CHECK: store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
147 ; SUPER-ALIGN_ON: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 8
148 ; SUPER-ALIGN_OFF: %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
149 ; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4
150 define amdgpu_kernel void @k4() {
151 %gep = getelementptr inbounds i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* bitcast ([2 x i32 addrspace(3)*] addrspace(3)* @lds.6 to i32 addrspace(3)* addrspace(3)*), i64 1
152 %ptr = bitcast i32 addrspace(3)* addrspace(3)* %gep to i32 addrspace(3)*
153 store i32 undef, i32 addrspace(3)* %ptr, align 4
154 store i32 addrspace(3)* %ptr, i32 addrspace(3)** undef, align 4
155 %val1 = cmpxchg volatile i32 addrspace(3)* %ptr, i32 1, i32 2 monotonic monotonic, align 4
156 %val2 = cmpxchg volatile i32 addrspace(3)** undef, i32 addrspace(3)* %ptr, i32 addrspace(3)* undef monotonic monotonic, align 4