1 // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
2 // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
// Test: a `private` attribution on a gpu.func lowers to an in-function
// llvm.alloca — a plain !llvm.ptr (default address space) for NVVM and
// !llvm.ptr<5> (AMDGPU private) for ROCDL — followed by construction of a
// complete 1-D memref descriptor over that allocation.
// NVVM-LABEL: llvm.func @private
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space<private>>) {
// Allocate private memory inside the function.
// NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
// NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr
// ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
// ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<5>
// Populate the memref descriptor.
// Fields: [0] allocated ptr, [1] aligned ptr, [2] offset, [3, i] sizes,
// [4, i] strides. For memref<4xf32>: offset 0, size 4, stride 1.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// NVVM: llvm.extractvalue %[[descr6:.*]]
// NVVM: llvm.getelementptr
// ROCDL: llvm.extractvalue %[[descr6:.*]]
// ROCDL: llvm.getelementptr
%c0 = arith.constant 0 : index
memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space<private>>
// Unregistered stand-in terminator; parses only because the RUN lines pass
// -allow-unregistered-dialect.
"terminator"() : () -> ()
// Test: a `workgroup` attribution lowers to a module-level LLVM global in
// address space 3 (shared memory on both targets); inside the function its
// address is taken with llvm.mlir.addressof and wrapped in a descriptor.
// Workgroup buffers are allocated as globals.
// NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
// NVVM-SAME: addr_space = 3
// NVVM-SAME: !llvm.array<4 x f32>
// ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
// ROCDL-SAME: addr_space = 3
// ROCDL-SAME: !llvm.array<4 x f32>
// NVVM-LABEL: llvm.func @workgroup
// ROCDL-LABEL: llvm.func @workgroup
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, #gpu.address_space<workgroup>>) {
// Get the address of the first element in the global array.
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
// NVVM-SAME: !llvm.ptr<3>
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
// ROCDL-SAME: !llvm.ptr<3>
// Populate the memref descriptor.
// Same 1-D descriptor shape as the private case, but the pointers are
// address-space-3 on both targets.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// NVVM: llvm.extractvalue %[[descr6:.*]]
// NVVM: llvm.getelementptr
// ROCDL: llvm.extractvalue %[[descr6:.*]]
// ROCDL: llvm.getelementptr
%c0 = arith.constant 0 : index
memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space<workgroup>>
// Unregistered stand-in terminator (accepted via -allow-unregistered-dialect).
"terminator"() : () -> ()
// Test: a rank-3 workgroup attribution. The backing global is flattened to a
// 1-D array of 4*2*6 = 48 elements, and the descriptor carries row-major
// sizes [4, 2, 6] with strides [12, 6, 1].
// Check that the total size was computed correctly.
// NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
// NVVM-SAME: addr_space = 3
// NVVM-SAME: !llvm.array<48 x f32>
// ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
// ROCDL-SAME: addr_space = 3
// ROCDL-SAME: !llvm.array<48 x f32>
// NVVM-LABEL: llvm.func @workgroup3d
// ROCDL-LABEL: llvm.func @workgroup3d
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, #gpu.address_space<workgroup>>) {
// Get the address of the first element in the global array.
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
// NVVM-SAME: !llvm.ptr<3>
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
// ROCDL-SAME: !llvm.ptr<3>
// Populate the memref descriptor.
// Size/stride pairs are interleaved per dimension: [3, i] is the size and
// [4, i] the stride of dimension i. Note the [[c6:.*]] pattern variable is
// deliberately rebound when the constant 6 is rematerialized.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
// NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : i64
// NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
// NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
// NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
// ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : i64
// ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
// ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : i64
// ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
%c0 = arith.constant 0 : index
memref.store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, #gpu.address_space<workgroup>>
// Unregistered stand-in terminator (accepted via -allow-unregistered-dialect).
"terminator"() : () -> ()
// Test: mixed attributions on one gpu.func. Each workgroup buffer gets its
// own module-level global; each private buffer gets its own alloca (plain
// !llvm.ptr for NVVM, !llvm.ptr<5> for ROCDL), all sized independently.
// Check that several buffers are defined.
// NVVM: llvm.mlir.global internal @[[$buffer1:.*]]()
// NVVM-SAME: !llvm.array<1 x f32>
// NVVM: llvm.mlir.global internal @[[$buffer2:.*]]()
// NVVM-SAME: !llvm.array<2 x f32>
// ROCDL: llvm.mlir.global internal @[[$buffer1:.*]]()
// ROCDL-SAME: !llvm.array<1 x f32>
// ROCDL: llvm.mlir.global internal @[[$buffer2:.*]]()
// ROCDL-SAME: !llvm.array<2 x f32>
// NVVM-LABEL: llvm.func @multiple
// ROCDL-LABEL: llvm.func @multiple
gpu.func @multiple(%arg0: f32)
workgroup(%arg1: memref<1xf32, #gpu.address_space<workgroup>>, %arg2: memref<2xf32, #gpu.address_space<workgroup>>)
private(%arg3: memref<3xf32, #gpu.address_space<private>>, %arg4: memref<4xf32, #gpu.address_space<private>>) {
// Workgroup buffers.
// NVVM: llvm.mlir.addressof @[[$buffer1]]
// NVVM: llvm.mlir.addressof @[[$buffer2]]
// ROCDL: llvm.mlir.addressof @[[$buffer1]]
// ROCDL: llvm.mlir.addressof @[[$buffer2]]
// Private buffers become per-function allocas sized 3 and 4 elements.
// NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
// NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
// NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr
// ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
// ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<5>
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
// ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<5>
// Touch every attribution so none of the buffers is dropped as unused.
%c0 = arith.constant 0 : index
memref.store %arg0, %arg1[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
memref.store %arg0, %arg2[%c0] : memref<2xf32, #gpu.address_space<workgroup>>
memref.store %arg0, %arg3[%c0] : memref<3xf32, #gpu.address_space<private>>
memref.store %arg0, %arg4[%c0] : memref<4xf32, #gpu.address_space<private>>
// Unregistered stand-in terminator (accepted via -allow-unregistered-dialect).
"terminator"() : () -> ()
// Test: an `llvm.align` attribute on an attribution propagates to the lowered
// storage — `alignment = 8` onto the workgroup global, `alignment = 4` onto
// the private alloca.
// Check that alignment attributes are set correctly
// NVVM: llvm.mlir.global internal @[[$buffer:.*]]()
// NVVM-SAME: addr_space = 3
// NVVM-SAME: alignment = 8
// NVVM-SAME: !llvm.array<48 x f32>
// ROCDL: llvm.mlir.global internal @[[$buffer:.*]]()
// ROCDL-SAME: addr_space = 3
// ROCDL-SAME: alignment = 8
// ROCDL-SAME: !llvm.array<48 x f32>
// NVVM-LABEL: llvm.func @explicitAlign
// ROCDL-LABEL: llvm.func @explicitAlign
gpu.func @explicitAlign(%arg0 : index)
workgroup(%arg1: memref<48xf32, #gpu.address_space<workgroup>> {llvm.align = 8 : i64})
private(%arg2: memref<48xf32, #gpu.address_space<private>> {llvm.align = 4 : i64}) {
// NVVM: %[[size:.*]] = llvm.mlir.constant(48 : i64) : i64
// NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 {alignment = 4 : i64} : (i64) -> !llvm.ptr
// ROCDL: %[[size:.*]] = llvm.mlir.constant(48 : i64) : i64
// ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 {alignment = 4 : i64} : (i64) -> !llvm.ptr<5>
// Load-then-store keeps both attributions live.
%val = memref.load %arg1[%arg0] : memref<48xf32, #gpu.address_space<workgroup>>
memref.store %val, %arg2[%arg0] : memref<48xf32, #gpu.address_space<private>>
// Unregistered stand-in terminator (accepted via -allow-unregistered-dialect).
"terminator"() : () -> ()