1 /* { dg-do run { target openacc_nvidia_accel_selected } } */
2 /* { dg-additional-options "-lcuda" } */
3 /* { dg-require-effective-target openacc_cuda } */
/* Test driver: launches the "delay" kernel from subr.ptx on N CUDA streams
   that are registered as OpenACC async queues, then exercises
   acc_wait_all_async / acc_async_test and compares measured time against the
   requested delay.
   NOTE(review): this view is a fragmentary extract -- local declarations,
   braces and the bodies of the failure branches are not visible here, so the
   comments below describe only the calls that are actually shown.  */
13 main (int argc
, char **argv
)
/* `streams` holds one handle per async queue 0..N-1; `stream` is reused for
   the extra queues N and N + 1 further down.  */
21 CUstream
*streams
, stream
;
/* `a` is the host buffer, `d_a` the device buffer mapped to it, and `dticks`
   the delay value handed to the kernel as its second argument.  */
22 unsigned long *a
, *d_a
, dticks
;
/* Bring up the OpenACC runtime on the NVIDIA device and fetch its number.  */
29 acc_init (acc_device_nvidia
);
31 devnum
= acc_get_device_num (acc_device_nvidia
);
/* Obtain the CUDA driver handle for that same device number.  */
33 r
= cuDeviceGet (&dev
, devnum
);
34 if (r
!= CUDA_SUCCESS
)
36 fprintf (stderr
, "cuDeviceGet failed: %d\n", r
);
/* Query the multiprocessor count into nprocs.
   NOTE(review): the call is truncated in this view (remaining arguments and
   the assignment of its result are not visible).  */
41 cuDeviceGetAttribute (&nprocs
, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
43 if (r
!= CUDA_SUCCESS
)
45 fprintf (stderr
, "cuDeviceGetAttribute failed: %d\n", r
);
/* Query the device clock rate; it is multiplied with dtime below to get the
   kernel delay (presumably reported in kHz by the driver API -- confirm).  */
49 r
= cuDeviceGetAttribute (&clkrate
, CU_DEVICE_ATTRIBUTE_CLOCK_RATE
, dev
);
50 if (r
!= CUDA_SUCCESS
)
52 fprintf (stderr
, "cuDeviceGetAttribute failed: %d\n", r
);
/* Load the PTX module from "subr.ptx" (path is relative) ...  */
56 r
= cuModuleLoad (&module
, "subr.ptx");
57 if (r
!= CUDA_SUCCESS
)
59 fprintf (stderr
, "cuModuleLoad failed: %d\n", r
);
/* ... and look up the kernel named "delay" in it.  */
63 r
= cuModuleGetFunction (&delay
, module
, "delay");
64 if (r
!= CUDA_SUCCESS
)
66 fprintf (stderr
, "cuModuleGetFunction failed: %d\n", r
);
/* Buffer size: one unsigned long per multiprocessor.  */
70 nbytes
= nprocs
* sizeof (unsigned long);
/* Delay handed to the kernel: dtime scaled by the clock rate (the value
   assigned to dtime is not visible in this view).  */
74 dticks
= (unsigned long) (dtime
* clkrate
);
/* Allocate the host buffer and the device buffer, then associate the two
   with acc_map_data.  Neither allocation result is checked in the visible
   text.  */
78 a
= (unsigned long *) malloc (nbytes
);
79 d_a
= (unsigned long *) acc_malloc (nbytes
);
81 acc_map_data (a
, d_a
, nbytes
);
/* Storage for N stream handles.
   NOTE(review): sized with sizeof (void *) rather than sizeof (CUstream);
   equivalent only while CUstream is a pointer type -- confirm.  */
83 streams
= (CUstream
*) malloc (N
* sizeof (void *));
/* For each async queue i in [0, N): read the stream currently attached to
   the queue, test it against NULL (the branch body is not visible;
   presumably a non-NULL result is treated as a failure), create a fresh
   stream and register it for queue i via acc_set_cuda_stream.  */
85 for (i
= 0; i
< N
; i
++)
87 streams
[i
] = (CUstream
) acc_get_cuda_stream (i
);
88 if (streams
[i
] != NULL
)
91 r
= cuStreamCreate (&streams
[i
], CU_STREAM_DEFAULT
);
92 if (r
!= CUDA_SUCCESS
)
94 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
98 if (!acc_set_cuda_stream (i
, streams
[i
]))
/* Kernel argument array: addresses of the device pointer and of the delay
   value.  */
104 kargs
[0] = (void *) &d_a
;
105 kargs
[1] = (void *) &dticks
;
/* Same get/create/register sequence for async queue N, using `stream`.  */
107 stream
= (CUstream
) acc_get_cuda_stream (N
);
111 r
= cuStreamCreate (&stream
, CU_STREAM_DEFAULT
);
112 if (r
!= CUDA_SUCCESS
)
114 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
118 if (!acc_set_cuda_stream (N
, stream
))
/* Launch the delay kernel once per stream in streams[0..N-1]: 1x1x1 grid,
   1x1x1 block, 0 bytes of dynamic shared memory, no extra options.  */
123 for (i
= 0; i
< N
; i
++)
125 r
= cuLaunchKernel (delay
, 1, 1, 1, 1, 1, 1, 0, streams
[i
], kargs
, 0);
126 if (r
!= CUDA_SUCCESS
)
128 fprintf (stderr
, "cuLaunchKernel failed: %d\n", r
);
/* Ask the runtime to make async queue N wait on all other queues (semantics
   per the OpenACC runtime API, not established by this file -- confirm).  */
133 acc_wait_all_async (N
);
/* First pass over queues 0..N inclusive: compare acc_async_test (i) with 0.
   The branch body is not visible; presumably the test fails if any queue
   already reports completion while the kernels are still delaying.  */
135 for (i
= 0; i
<= N
; i
++)
137 if (acc_async_test (i
) != 0)
/* Second pass over queues 0..N inclusive: compare acc_async_test (i) with 1.
   Whatever happens between the two passes (e.g. a wait) is not visible in
   this view.  */
143 for (i
= 0; i
<= N
; i
++)
145 if (acc_async_test (i
) != 1)
/* Read timer 0; the comparison that guards the message below is not visible,
   but the message indicates elapsed time must not be shorter than the
   delay.  */
149 atime
= stop_timer (0);
153 fprintf (stderr
, "actual time < delay time\n");
/* Set up async queue N + 1 the same way (get, create, register), reusing
   `stream`.  */
159 stream
= (CUstream
) acc_get_cuda_stream (N
+ 1);
163 r
= cuStreamCreate (&stream
, CU_STREAM_DEFAULT
);
164 if (r
!= CUDA_SUCCESS
)
166 fprintf (stderr
, "cuStreamCreate failed: %d\n", r
);
170 if (!acc_set_cuda_stream (N
+ 1, stream
))
/* With no new kernels launched in the visible text, waiting via queue N + 1
   is followed by an upper-bound check on elapsed time (the guarding
   comparison is not visible).  */
173 acc_wait_all_async (N
+ 1);
177 atime
= stop_timer (0);
181 fprintf (stderr
, "actual time too long\n");
/* Repeat the same upper-bound check using queue N.  */
187 acc_wait_all_async (N
);
191 atime
= stop_timer (0);
195 fprintf (stderr
, "actual time too long\n");
/* Tear down the OpenACC runtime for the NVIDIA device.  */
207 acc_shutdown (acc_device_nvidia
);
212 /* { dg-output "" } */