4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2010 PathScale Inc. All rights reserved.
24 * Use is subject to license terms.
35 #include "libpscnv_ib.h"
/*
 * Byte offsets of the two entry points inside the compute program image
 * below.  stridetest() writes START_STRIDED and lineartest() writes
 * START_LINEAR to the method commented as CP_START_ID.
 *
 * The offsets are tied to the word layout: the first routine as listed here
 * is 20 32-bit words (0x50 bytes), and the loop branch targets (0x18 and
 * 0x80) are byte offsets into the same image.  Adding or removing a word
 * means updating START_LINEAR and both branch targets.
 */
38 #define START_STRIDED 0
39 #define START_LINEAR 0x50
/*
 * Routine at START_STRIDED (per the disassembly comments on each word):
 * the loop counter starts at tid.x + ctaid.x * ntid.x and advances by
 * ntid.x * nctaid.x; each iteration copies one 32-bit word from g0 to g1 at
 * byte offset counter*4, until the counter reaches the size read from
 * s[0x10].
 */
41 0x11002218, // mov b16 $r3l u16 s[0x2] // gets ntid.x to $r3l
42 0x61062c00, // add $r0 mul u16 u16 s[0xc] $r3l $r0 // gets tid.x + ctaid.x * ntid.x to $r0, our loop counter [tid.x is always at $r0 at CP start]
43 0x3000c9fd, 0x6420c7c8, // set le u32 $c0 # b32 s[0x10] $r0 // if size <= counter, set $c0
44 0x30000003, 0x00000280, // lg $c0 ret // if size <= counter, exit
46 0x30020005, 0xc4100780, // shl b32 $r1 $r0 0x2 // counter*4 to $r1
47 0xd0000209, 0x80c00780, // mov b32 $r2 g0[$r1] // read from input area
48 0xd0010209, 0xa0c00780, // mov b32 g1[$r1] $r2 // write to output area
49 0x60064801, 0x00200780, // add $r0 mul u16 u16 s[0x8] $r3l $r0 // counter += ntid.x * nctaid.x
50 0x3000c9fd, 0x642107c8, // set gt u32 $c0 # b32 s[0x10] $r0 // if size > counter, set $c0
51 0x10003003, 0x00000280, // lg $c0 bra 0x18 // if size > counter, loop
52 0x30000003, 0x00000780, // ret // exit program.
/*
 * Routine at START_LINEAR (per the disassembly comments on each word):
 * reads a stride from s[0x14], starts the counter at stride * ctaid.x and
 * copies 32-bit words from g0 to g1 one at a time until the counter reaches
 * min(stride * ctaid.x + stride, size), with size read from s[0x10].
 */
54 0x1100ea00, // mov b32 $r0 b32 s[0x14] // gets stride to $r0
55 0x41002c04, // mul $r1 u16 u16 s[0xc] u16 $r0l
56 0x41012c08, // mul $r2 u16 u16 s[0xc] u16 $r0h
57 0x2004060c, // add b16 $r1h $r1h $r2l // multiplies stride by ctaid.x and puts result into $r1, our loop counter
58 0x20000009, 0x04004780, // add b32 $r2 $r0 $r1 // stride*ctaid.x + stride into $r2
59 0x3002c809, 0xa4200780, // min u32 $r2 b32 s[0x10] $r2 // min (stride*ctaid.x + stride, size) into $r2, the upper bound of our loop
60 0x300203fd, 0x640187c8, // set ge u32 $c0 # $r1 $r2 // set $c0 if counter >= max
61 0x30000003, 0x00000280, // lg $c0 ret // finish execution if counter >= max
63 0x3002020d, 0xc4100780, // shl b32 $r3 $r1 0x2 // counter*4 to $r3
64 0xd0000601, 0x80c00780, // mov b32 $r0 g0[$r3] // read from input area
65 0xd0010601, 0xa0c00780, // mov b32 g1[$r3] $r0 // write to output area
66 0x20018205, 0x00000003, // add b32 $r1 $r1 0x1 // counter++
67 0x300203fd, 0x640047c8, // set lt u32 $c0 # $r1 $r2 // set $c0 if counter < max
68 0x10010003, 0x00000280, // lg $c0 bra 0x80 // and branch back to loop if so
69 0x30000003, 0x00000780, // ret // exit program.
/* Size in bytes of the program image; init() uses it as the memcpy length into cp->map. */
73 #define CPSZ sizeof(cpcode)
/*
 * Source buffer: allocated by init() with bytes + 0x4000, filled with 0x01
 * bytes by prepare_mem(), and programmed as the "input segment" by the tests.
 */
75 struct pscnv_ib_bo
*in
;
/*
 * Destination buffer: same size as `in`, zeroed by prepare_mem(), programmed
 * as the "output segment" and verified by check_mem().
 */
76 struct pscnv_ib_bo
*out
;
/* 4096-byte buffer that init() fills with the cpcode program image. */
77 struct pscnv_ib_bo
*cp
;
/* Channel created by init(); all ring commands and the completion poll go through it. */
78 struct pscnv_ib_chan
*chan
;
/*
 * Test geometry shared by all functions:
 *   bytes   - number of bytes to copy (set from an option argument in main()),
 *   ints    - bytes / sizeof(int), the number of words check_mem() verifies,
 *   threads - threads per block used by stridetest(),
 *   ctas    - grid size used by both tests (assignment not visible in this
 *             part of the file; presumably also a command-line option).
 */
80 int bytes
, ints
, threads
, ctas
;
/*
 * One-time setup for the copy tests.
 *
 * Opens the "pscnv" DRM device, creates a channel, creates an engine object
 * of class 0x50c0 on it, allocates the in/out buffers (bytes + 0x4000 each)
 * and the 4096-byte code buffer, copies cpcode into the code buffer, and
 * emits the initial ring commands that bind the object and its DMA handles.
 *
 * drm_fd: not referenced in the lines visible here; presumably receives the
 *         opened descriptor -- confirm against the full function.
 *
 * Each failing step prints a diagnostic to stdout.  The error codes are
 * negated before strerror(), so the callees are assumed to return negative
 * errno values -- confirm.
 */
82 void init(int *drm_fd
) {
86 fd
= drmOpen("pscnv", 0);
/* NOTE(review): unlike the other diagnostics here, this message has no trailing newline. */
88 printf ("failed to open drm");
/* The 0xdeadbeef value given here is the same one written to DMA_GLOBAL and DMA_CODE_CB below. */
93 err
= pscnv_ib_chan_new(fd
, 0, &chan
, 0xdeadbeef, 0, 0);
95 printf ("chan: %s\n", strerror(-err
));
/* The 0xdeadd00d value given here is the same one written to method 0 below. */
99 ret
= pscnv_obj_eng_new(fd
, chan
->cid
, 0xdeadd00d, 0x50c0, 0);
101 printf ("tesla: %s\n", strerror(-ret
));
/* Input buffer; the assignment inside the condition is intentional (non-zero means failure). */
105 if (err
= pscnv_ib_bo_alloc(fd
, chan
->vid
, 0x1, PSCNV_GEM_VRAM_SMALL
| PSCNV_GEM_MAPPABLE
, 0, bytes
+ 0x4000, 0, &in
)) {
106 printf ("in: %s\n", strerror(-err
));
/* Output buffer, allocated with the same flags and size as the input buffer. */
110 if (err
= pscnv_ib_bo_alloc(fd
, chan
->vid
, 0x1, PSCNV_GEM_VRAM_SMALL
| PSCNV_GEM_MAPPABLE
, 0, bytes
+ 0x4000, 0, &out
)) {
111 printf ("out: %s\n", strerror(-err
));
/* Code buffer: a fixed 4096 bytes, which must be at least CPSZ for the memcpy below. */
115 if (err
= pscnv_ib_bo_alloc(fd
, chan
->vid
, 0x1, PSCNV_GEM_VRAM_SMALL
| PSCNV_GEM_MAPPABLE
, 0, 4096, 0, &cp
)) {
116 printf ("cp: %s\n", strerror(-err
));
/* Upload the program image through the buffer's CPU mapping. */
120 memcpy(cp
->map
, cpcode
, CPSZ
);
122 /* from which offset of pushbuf, size of command and find free slot by nouveau_dma_wait and submit by nvchan_wr32(chan, 0x8c, chan->dma.ib_put); */
123 /* write batchbuffer don't forget to fire at the end */
125 /* nouveau_pushbufs_alloc */
/* Method 0 on subchannel 0: written with the engine object value created above. */
127 BEGIN_RING50(chan
, 0, 0, 1);
128 OUT_RING(chan
, 0xdeadd00d);
130 // BEGIN_RING50(chan, 0, 0x180, 1); // DMA_NOTIFY
131 // OUT_RING(chan, (subc << 13) | (1 << 18) | 0x180);
132 // OUT_RING(chan, notify->handle);
134 BEGIN_RING50(chan
, 0, 0x1a0, 1); // DMA_GLOBAL
135 OUT_RING(chan
, 0xdeadbeef);
137 BEGIN_RING50(chan
, 0, 0x1c0, 1); // DMA_CODE_CB
138 OUT_RING(chan
, 0xdeadbeef);
/* NOTE(review): the payload words for the next two methods are not visible in this part of the file. */
140 BEGIN_RING50(chan
, 0, 0x2b8, 1); // enable all lanes
142 BEGIN_RING50(chan
, 0, 0x3b8, 1);
144 /* nouveau_pushbufs_submit */
/*
 * Wall-clock timestamps bracketing one test run: tvb is taken at the end of
 * prepare_mem(), tve in check_mem(); their difference is the printed time.
 */
149 struct timeval tvb
, tve
;
/*
 * Reset the buffers before a test run and start the timer.
 *
 * Fills the first `bytes` bytes of in->map with 0x01 and of out->map with
 * 0x00, then records the start time in tvb.
 *
 * fd: not referenced in the lines visible here.
 *
 * NOTE(review): the nouveau_bo_map/nouveau_bo_unmap lines duplicate the
 * direct memset calls that follow them; presumably a disabled older path --
 * confirm against the full file.
 */
151 void prepare_mem(int fd
) {
155 if (err = nouveau_bo_map(in, NOUVEAU_BO_RD|NOUVEAU_BO_WR)) {
156 printf ("mapin: %s\n", strerror(-err));
159 memset (in->map, 1, bytes);
160 nouveau_bo_unmap (in);
/* Every source byte becomes 0x01, so each 32-bit word reads 0x01010101. */
162 memset(in
->map
, 1, bytes
);
164 if (err = nouveau_bo_map(out, NOUVEAU_BO_RD|NOUVEAU_BO_WR)) {
165 printf ("mapout: %s\n", strerror(-err));
168 memset (out->map, 0, bytes);
169 nouveau_bo_unmap (out);
/* Clear the destination so stale data from a previous run cannot pass the check. */
171 memset(out
->map
, 0, bytes
);
/* Start of the timed interval; check_mem() takes the matching end timestamp. */
172 gettimeofday(&tvb
, 0);
/*
 * Stop the timer, report the elapsed time, and verify the copy.
 *
 * Prints the seconds elapsed since prepare_mem() stamped tvb, copies `bytes`
 * bytes of out->map into a heap buffer, and checks that each of the `ints`
 * words equals 0x01010101 (the pattern prepare_mem() put in the source).
 * Prints the first failing index on a mismatch, "Passed." otherwise.
 *
 * fd: not referenced in the lines visible here.
 *
 * NOTE(review): the malloc() result is used without a NULL check, and no
 * matching free() is visible in this part of the file -- confirm.
 */
175 void check_mem(int fd
) {
179 if (err = nouveau_bo_map(out, NOUVEAU_BO_RD|NOUVEAU_BO_WR)) {
180 printf ("mapout: %s\n", strerror(-err));
184 gettimeofday(&tve
, 0); // we need to get the time after mapping, since it's the sync point with GPU
/* Elapsed time = whole-second difference plus the microsecond difference scaled to seconds. */
185 double secdiff
= tve
.tv_sec
- tvb
.tv_sec
;
186 secdiff
+= (tve
.tv_usec
- tvb
.tv_usec
) / 1000000.0;
187 printf ("\t%fs ", secdiff
);
/* Snapshot the output buffer into ordinary memory before scanning it. */
189 intptr
= malloc(bytes
);
190 memcpy(intptr
, out
->map
, bytes
);
192 // intptr = out->map;
193 for (i
= 0; i
< ints
; i
++)
194 if (intptr
[i
] != 0x01010101) {
195 printf ("Copy failed at index %d!\n", i
);
199 // nouveau_bo_unmap (out);
200 printf ("Passed.\n");
/*
 * Run the copy using the routine at START_STRIDED.
 *
 * Programs the code address, `threads` threads per block, a grid of `ctas`
 * blocks, the input and output segments, and a single user parameter
 * (bytes/4, the number of 32-bit words to copy).  It then launches and spins
 * until chan->chmap[0x48/4] reads 1.
 *
 * fd: not referenced in the lines visible here.
 *
 * NOTE(review): the completion wait is an unbounded busy loop with no
 * timeout; a hung launch would hang the program.
 */
203 void stridetest(int fd
) {
204 printf ("Trying strided access... ");
/* Code address, written as two words: upper 32 bits first, then the lower part. */
207 BEGIN_RING50(chan
, 0, 0x210, 2);
208 // OUT_RELOC(chan, cp, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
209 // OUT_RELOC(chan, cp, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
210 OUT_RING(chan
, cp
->vm_base
>> 32);
211 OUT_RING(chan
, cp
->vm_base
);
213 BEGIN_RING50(chan
, 0, 0x2b4, 1);
214 OUT_RING(chan
, threads
); // THREADS_PER_BLOCK
/* CPREGS is defined outside the lines visible here. */
216 BEGIN_RING50(chan
, 0, 0x2c0, 1);
217 OUT_RING(chan
, CPREGS
);
/* Five consecutive methods starting at 0x3a4; the fifth (0x3b4) is written again below with the real entry offset. */
219 BEGIN_RING50(chan
, 0, 0x3a4, 5);
220 OUT_RING(chan
, 0x00010000 | ctas
); // GRIDDIM
221 OUT_RING(chan
, 0x40); // SHARED_SIZE
222 OUT_RING(chan
, 0x10000 | threads
); // BLOCKDIM_XY
223 OUT_RING(chan
, 0x1); // BLOCKDIM_Z
224 OUT_RING(chan
, 0); // CP_START_ID
/* NOTE(review): the header announces 5 words but only 4 OUT_RINGs are visible in this part of the file. */
226 BEGIN_RING50(chan
, 0, 0x400, 5); // input segment
227 // OUT_RELOC(chan, in, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
228 // OUT_RELOC(chan, in, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
229 OUT_RING(chan
, in
->vm_base
>> 32);
230 OUT_RING(chan
, in
->vm_base
);
231 OUT_RING(chan
, 0x00000);
232 OUT_RING(chan
, 0xfffffff);
/* Same layout as the input segment, pointing at the output buffer. */
235 BEGIN_RING50(chan
, 0, 0x420, 5); // output segment
236 // OUT_RELOC(chan, out, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0);
237 // OUT_RELOC(chan, out, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0);
238 OUT_RING(chan
, out
->vm_base
>> 32);
239 OUT_RING(chan
, out
->vm_base
);
240 OUT_RING(chan
, 0x00000);
241 OUT_RING(chan
, 0xfffffff);
/* One user parameter (count is in bits 8 and up). */
244 BEGIN_RING50(chan
, 0, 0x374, 1); // USER_PARAM_COUNT
245 OUT_RING(chan
, 1 << 8);
/* The parameter is the word count; the routine's comments read it as "size" from s[0x10]. */
247 BEGIN_RING50(chan
, 0, 0x600, 1); // USER_PARAM
248 OUT_RING(chan
, bytes
/4);
250 BEGIN_RING50(chan
, 0, 0x3b4, 1); // CP_START_ID
251 OUT_RING(chan
, START_STRIDED
);
253 BEGIN_RING50(chan
, 0, 0x2f8, 1);
254 OUT_RING(chan
, 1); // latch BLOCKDIM
256 BEGIN_RING50(chan
, 0, 0x368, 1);
257 OUT_RING(chan
, 0); // LAUNCH
/*
 * NOTE(review): labelled LAUNCH like the method above, but the value written
 * here (1) is what the loop below polls for, so this is presumably a
 * reference/completion marker rather than a second launch -- confirm.
 */
259 BEGIN_RING50(chan
, 0, 0x50, 1);
260 OUT_RING(chan
, 0x1); // LAUNCH
/* Spin until the channel reports the marker value written above. */
264 while (chan
->chmap
[0x48/4] != 1);
/*
 * Run the copy using the routine at START_LINEAR.
 *
 * Programs the code address, one thread per block, a grid of `ctas` blocks,
 * the input and output segments, and two user parameters (bytes, and the
 * per-block stride = ceil(ints / ctas)).  It then launches and spins until
 * chan->chmap[0x48/4] reads 2.
 *
 * fd: not referenced in the lines visible here.
 *
 * NOTE(review): the stride computation divides by `ctas`; no guard against
 * ctas == 0 is visible here.  The completion wait is an unbounded busy loop.
 */
269 void lineartest(int fd
) {
270 printf ("Trying linear access... ");
/* Code address, written as two words: upper 32 bits first, then the lower part. */
273 BEGIN_RING50(chan
, 0, 0x210, 2);
274 // OUT_RELOC(chan, cp, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
275 // OUT_RELOC(chan, cp, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
276 OUT_RING(chan
, cp
->vm_base
>> 32);
277 OUT_RING(chan
, cp
->vm_base
);
/* A single thread per block: each block walks its own range sequentially. */
279 BEGIN_RING50(chan
, 0, 0x2b4, 1);
280 OUT_RING(chan
, 1); // THREADS_PER_BLOCK
/* CPREGS is defined outside the lines visible here. */
282 BEGIN_RING50(chan
, 0, 0x2c0, 1);
283 OUT_RING(chan
, CPREGS
);
/* Five consecutive methods starting at 0x3a4; the fifth (0x3b4) is written again below with the real entry offset. */
285 BEGIN_RING50(chan
, 0, 0x3a4, 5);
286 OUT_RING(chan
, 0x00010000 | ctas
); // GRIDDIM
287 OUT_RING(chan
, 0x40); // SHARED_SIZE
288 OUT_RING(chan
, 0x10001); // BLOCKDIM_XY
289 OUT_RING(chan
, 0x1); // BLOCKDIM_Z
290 OUT_RING(chan
, 0); // CP_START_ID
/* NOTE(review): the header announces 5 words but only 4 OUT_RINGs are visible in this part of the file. */
292 BEGIN_RING50(chan
, 0, 0x400, 5); // input segment
293 // OUT_RELOC(chan, in, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);
294 // OUT_RELOC(chan, in, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
295 OUT_RING(chan
, in
->vm_base
>> 32);
296 OUT_RING(chan
, in
->vm_base
);
297 OUT_RING(chan
, 0x00000);
298 OUT_RING(chan
, 0xfffffff);
/* Same layout as the input segment, pointing at the output buffer. */
301 BEGIN_RING50(chan
, 0, 0x420, 5); // output segment
302 // OUT_RELOC(chan, out, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0);
303 // OUT_RELOC(chan, out, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0);
304 OUT_RING(chan
, out
->vm_base
>> 32);
305 OUT_RING(chan
, out
->vm_base
);
306 OUT_RING(chan
, 0x00000);
307 OUT_RING(chan
, 0xfffffff);
/* Two user parameters (count is in bits 8 and up). */
310 BEGIN_RING50(chan
, 0, 0x374, 1); // USER_PARAM_COUNT
311 OUT_RING(chan
, 2 << 8);
/* Words per block, rounded up so that ctas * stride covers all `ints` words. */
313 int stride
= ( ints
% ctas
== 0 ) ? ints
/ ctas
: (ints
/ ctas
) + 1;
/*
 * NOTE(review): the first parameter is `bytes`, whereas stridetest() passes
 * bytes/4.  The routine's comments use it as the upper bound of a counter
 * that is multiplied by 4 before addressing, i.e. a word count.  With
 * `bytes` the last block's bound is stride*ctas rather than ints, so it may
 * copy a few words past `bytes` (inside the 0x4000 slack the buffers are
 * allocated with).  Confirm whether `ints` was intended.
 */
315 BEGIN_RING50(chan
, 0, 0x600, 2); // USER_PARAM
316 OUT_RING(chan
, bytes
);
317 OUT_RING(chan
, stride
);
319 BEGIN_RING50(chan
, 0, 0x3b4, 1); // CP_START_ID
320 OUT_RING(chan
, START_LINEAR
);
322 BEGIN_RING50(chan
, 0, 0x2f8, 1);
323 OUT_RING(chan
, 1); // latch BLOCKDIM
325 BEGIN_RING50(chan
, 0, 0x368, 1);
326 OUT_RING(chan
, 0); // LAUNCH
/*
 * NOTE(review): the payload for this method is not visible in this part of
 * the file; the loop below polls for 2, so presumably 2 is written here as
 * the completion marker -- confirm.
 */
328 BEGIN_RING50(chan
, 0, 0x50, 1);
/* Spin until the channel reports marker value 2 (stridetest() waits for 1). */
333 while (chan
->chmap
[0x48/4] != 2);
337 int main(int argc
, char **argv
) {
347 while ((c
= getopt (argc
, argv
, "s:c:t:")) != -1)
350 bytes
= atoi(optarg
);
356 threads
= atoi(optarg
);
360 ints
= bytes
/ sizeof(int);