cuda.c: move up free_device_arrays
[ppcg.git] / cuda.c
blob48e38f30f1717c9ffa68da3549556150e91d3460
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
36 " } while(0)\n\n";
38 p = isl_printer_print_str(p, macros);
39 return p;
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 struct gpu_array_info *array)
47 int i;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, array->type);
51 p = isl_printer_print_str(p, " ");
52 if (!array->linearize && array->n_index > 1)
53 p = isl_printer_print_str(p, "(");
54 p = isl_printer_print_str(p, "*dev_");
55 p = isl_printer_print_str(p, array->name);
56 if (!array->linearize && array->n_index > 1) {
57 p = isl_printer_print_str(p, ")");
58 for (i = 1; i < array->n_index; i++) {
59 p = isl_printer_print_str(p, "[");
60 p = isl_printer_print_pw_aff(p, array->bound[i]);
61 p = isl_printer_print_str(p, "]");
64 p = isl_printer_print_str(p, ";");
65 p = isl_printer_end_line(p);
67 return p;
70 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
71 struct gpu_prog *prog)
73 int i;
75 for (i = 0; i < prog->n_array; ++i) {
76 if (!gpu_array_requires_device_allocation(&prog->array[i]))
77 continue;
79 p = declare_device_array(p, &prog->array[i]);
81 p = isl_printer_start_line(p);
82 p = isl_printer_end_line(p);
83 return p;
86 static __isl_give isl_printer *allocate_device_arrays(
87 __isl_take isl_printer *p, struct gpu_prog *prog)
89 int i;
91 for (i = 0; i < prog->n_array; ++i) {
92 if (!gpu_array_requires_device_allocation(&prog->array[i]))
93 continue;
94 p = isl_printer_start_line(p);
95 p = isl_printer_print_str(p,
96 "cudaCheckReturn(cudaMalloc((void **) &dev_");
97 p = isl_printer_print_str(p, prog->array[i].name);
98 p = isl_printer_print_str(p, ", ");
99 p = gpu_array_info_print_size(p, &prog->array[i]);
100 p = isl_printer_print_str(p, "));");
101 p = isl_printer_end_line(p);
103 p = isl_printer_start_line(p);
104 p = isl_printer_end_line(p);
105 return p;
108 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
109 struct gpu_prog *prog)
111 int i;
113 for (i = 0; i < prog->n_array; ++i) {
114 if (!gpu_array_requires_device_allocation(&prog->array[i]))
115 continue;
116 p = isl_printer_start_line(p);
117 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
118 p = isl_printer_print_str(p, prog->array[i].name);
119 p = isl_printer_print_str(p, "));");
120 p = isl_printer_end_line(p);
123 return p;
126 /* Print code to "p" for copying "array" from the host to the device
127 * in its entirety. The bounds on the extent of "array" have
128 * been precomputed in extract_array_info and are used in
129 * gpu_array_info_print_size.
131 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
132 struct gpu_array_info *array)
134 p = isl_printer_start_line(p);
135 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
136 p = isl_printer_print_str(p, array->name);
137 p = isl_printer_print_str(p, ", ");
139 if (gpu_array_is_scalar(array))
140 p = isl_printer_print_str(p, "&");
141 p = isl_printer_print_str(p, array->name);
142 p = isl_printer_print_str(p, ", ");
144 p = gpu_array_info_print_size(p, array);
145 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
146 p = isl_printer_end_line(p);
148 return p;
151 /* Print code to "p" for copying "array" back from the device to the host
152 * in its entirety. The bounds on the extent of "array" have
153 * been precomputed in extract_array_info and are used in
154 * gpu_array_info_print_size.
156 static __isl_give isl_printer *copy_array_from_device(
157 __isl_take isl_printer *p, struct gpu_array_info *array)
159 p = isl_printer_start_line(p);
160 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
161 if (gpu_array_is_scalar(array))
162 p = isl_printer_print_str(p, "&");
163 p = isl_printer_print_str(p, array->name);
164 p = isl_printer_print_str(p, ", dev_");
165 p = isl_printer_print_str(p, array->name);
166 p = isl_printer_print_str(p, ", ");
167 p = gpu_array_info_print_size(p, array);
168 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
169 p = isl_printer_end_line(p);
171 return p;
174 static void print_reverse_list(FILE *out, int len, int *list)
176 int i;
178 if (!out || len == 0)
179 return;
181 fprintf(out, "(");
182 for (i = 0; i < len; ++i) {
183 if (i)
184 fprintf(out, ", ");
185 fprintf(out, "%d", list[len - 1 - i]);
187 fprintf(out, ")");
190 /* Print the effective grid size as a list of the sizes in each
191 * dimension, from innermost to outermost.
193 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
194 struct ppcg_kernel *kernel)
196 int i;
197 int dim;
199 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
200 if (dim == 0)
201 return p;
203 p = isl_printer_print_str(p, "(");
204 for (i = dim - 1; i >= 0; --i) {
205 isl_pw_aff *bound;
207 bound = isl_multi_pw_aff_get_pw_aff(kernel->grid_size, i);
208 p = isl_printer_print_pw_aff(p, bound);
209 isl_pw_aff_free(bound);
211 if (i > 0)
212 p = isl_printer_print_str(p, ", ");
215 p = isl_printer_print_str(p, ")");
217 return p;
220 /* Print the grid definition.
222 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
223 struct ppcg_kernel *kernel)
225 p = isl_printer_start_line(p);
226 p = isl_printer_print_str(p, "dim3 k");
227 p = isl_printer_print_int(p, kernel->id);
228 p = isl_printer_print_str(p, "_dimGrid");
229 p = print_grid_size(p, kernel);
230 p = isl_printer_print_str(p, ";");
231 p = isl_printer_end_line(p);
233 return p;
236 /* Print the arguments to a kernel declaration or call. If "types" is set,
237 * then print a declaration (including the types of the arguments).
239 * The arguments are printed in the following order
240 * - the arrays accessed by the kernel
241 * - the parameters
242 * - the host loop iterators
244 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
245 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
247 int i, n;
248 int first = 1;
249 unsigned nparam;
250 isl_space *space;
251 const char *type;
253 for (i = 0; i < prog->n_array; ++i) {
254 int required;
256 required = ppcg_kernel_requires_array_argument(kernel, i);
257 if (required < 0)
258 return isl_printer_free(p);
259 if (!required)
260 continue;
262 if (!first)
263 p = isl_printer_print_str(p, ", ");
265 if (types)
266 p = gpu_array_info_print_declaration_argument(p,
267 &prog->array[i], NULL);
268 else
269 p = gpu_array_info_print_call_argument(p,
270 &prog->array[i]);
272 first = 0;
275 space = isl_union_set_get_space(kernel->arrays);
276 nparam = isl_space_dim(space, isl_dim_param);
277 for (i = 0; i < nparam; ++i) {
278 const char *name;
280 name = isl_space_get_dim_name(space, isl_dim_param, i);
282 if (!first)
283 p = isl_printer_print_str(p, ", ");
284 if (types)
285 p = isl_printer_print_str(p, "int ");
286 p = isl_printer_print_str(p, name);
288 first = 0;
290 isl_space_free(space);
292 n = isl_space_dim(kernel->space, isl_dim_set);
293 type = isl_options_get_ast_iterator_type(prog->ctx);
294 for (i = 0; i < n; ++i) {
295 const char *name;
297 if (!first)
298 p = isl_printer_print_str(p, ", ");
299 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
300 if (types) {
301 p = isl_printer_print_str(p, type);
302 p = isl_printer_print_str(p, " ");
304 p = isl_printer_print_str(p, name);
306 first = 0;
309 return p;
312 /* Print the header of the given kernel.
314 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
315 struct gpu_prog *prog, struct ppcg_kernel *kernel)
317 p = isl_printer_start_line(p);
318 p = isl_printer_print_str(p, "__global__ void kernel");
319 p = isl_printer_print_int(p, kernel->id);
320 p = isl_printer_print_str(p, "(");
321 p = print_kernel_arguments(p, prog, kernel, 1);
322 p = isl_printer_print_str(p, ")");
324 return p;
327 /* Print the header of the given kernel to both gen->cuda.kernel_h
328 * and gen->cuda.kernel_c.
330 static void print_kernel_headers(struct gpu_prog *prog,
331 struct ppcg_kernel *kernel, struct cuda_info *cuda)
333 isl_printer *p;
335 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
336 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
337 p = print_kernel_header(p, prog, kernel);
338 p = isl_printer_print_str(p, ";");
339 p = isl_printer_end_line(p);
340 isl_printer_free(p);
342 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
343 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
344 p = print_kernel_header(p, prog, kernel);
345 p = isl_printer_end_line(p);
346 isl_printer_free(p);
349 static void print_indent(FILE *dst, int indent)
351 fprintf(dst, "%*s", indent, "");
354 /* Print a list of iterators of type "type" with names "ids" to "out".
355 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
356 * In particular, the last iterator is assigned the x identifier
357 * (the first in the list of cuda identifiers).
359 static void print_iterators(FILE *out, const char *type,
360 __isl_keep isl_id_list *ids, const char *cuda_dims[])
362 int i, n;
364 n = isl_id_list_n_id(ids);
365 if (n <= 0)
366 return;
367 print_indent(out, 4);
368 fprintf(out, "%s ", type);
369 for (i = 0; i < n; ++i) {
370 isl_id *id;
372 if (i)
373 fprintf(out, ", ");
374 id = isl_id_list_get_id(ids, i);
375 fprintf(out, "%s = %s", isl_id_get_name(id),
376 cuda_dims[n - 1 - i]);
377 isl_id_free(id);
379 fprintf(out, ";\n");
382 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
384 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
385 const char *type;
386 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
387 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
388 "threadIdx.z" };
390 type = isl_options_get_ast_iterator_type(ctx);
392 print_iterators(out, type, kernel->block_ids, block_dims);
393 print_iterators(out, type, kernel->thread_ids, thread_dims);
396 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
397 struct ppcg_kernel_var *var)
399 int j;
401 p = isl_printer_start_line(p);
402 if (var->type == ppcg_access_shared)
403 p = isl_printer_print_str(p, "__shared__ ");
404 p = isl_printer_print_str(p, var->array->type);
405 p = isl_printer_print_str(p, " ");
406 p = isl_printer_print_str(p, var->name);
407 for (j = 0; j < var->array->n_index; ++j) {
408 isl_val *v;
410 p = isl_printer_print_str(p, "[");
411 v = isl_vec_get_element_val(var->size, j);
412 p = isl_printer_print_val(p, v);
413 isl_val_free(v);
414 p = isl_printer_print_str(p, "]");
416 p = isl_printer_print_str(p, ";");
417 p = isl_printer_end_line(p);
419 return p;
422 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
423 struct ppcg_kernel *kernel)
425 int i;
427 for (i = 0; i < kernel->n_var; ++i)
428 p = print_kernel_var(p, &kernel->var[i]);
430 return p;
433 /* Print a sync statement.
435 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
436 struct ppcg_kernel_stmt *stmt)
438 p = isl_printer_start_line(p);
439 p = isl_printer_print_str(p, "__syncthreads();");
440 p = isl_printer_end_line(p);
442 return p;
445 /* This function is called for each user statement in the AST,
446 * i.e., for each kernel body statement, copy statement or sync statement.
448 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
449 __isl_take isl_ast_print_options *print_options,
450 __isl_keep isl_ast_node *node, void *user)
452 isl_id *id;
453 struct ppcg_kernel_stmt *stmt;
455 id = isl_ast_node_get_annotation(node);
456 stmt = isl_id_get_user(id);
457 isl_id_free(id);
459 isl_ast_print_options_free(print_options);
461 switch (stmt->type) {
462 case ppcg_kernel_copy:
463 return ppcg_kernel_print_copy(p, stmt);
464 case ppcg_kernel_sync:
465 return print_sync(p, stmt);
466 case ppcg_kernel_domain:
467 return ppcg_kernel_print_domain(p, stmt);
470 return p;
473 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
474 struct cuda_info *cuda)
476 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
477 isl_ast_print_options *print_options;
478 isl_printer *p;
480 print_kernel_headers(prog, kernel, cuda);
481 fprintf(cuda->kernel_c, "{\n");
482 print_kernel_iterators(cuda->kernel_c, kernel);
484 p = isl_printer_to_file(ctx, cuda->kernel_c);
485 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
486 p = isl_printer_indent(p, 4);
488 p = print_kernel_vars(p, kernel);
489 p = isl_printer_end_line(p);
490 p = ppcg_set_macro_names(p);
491 p = isl_ast_op_type_print_macro(isl_ast_op_fdiv_q, p);
492 p = ppcg_print_macros(p, kernel->tree);
494 print_options = isl_ast_print_options_alloc(ctx);
495 print_options = isl_ast_print_options_set_print_user(print_options,
496 &print_kernel_stmt, NULL);
497 p = isl_ast_node_print(kernel->tree, p, print_options);
498 isl_printer_free(p);
500 fprintf(cuda->kernel_c, "}\n");
503 /* Print a statement for copying an array to or from the device.
504 * The statement identifier is called "to_device_<array name>" or
505 * "from_device_<array name>" and its user pointer points
506 * to the gpu_array_info of the array that needs to be copied.
508 * Extract the array from the identifier and call
509 * copy_array_to_device or copy_array_from_device.
511 static __isl_give isl_printer *print_to_from_device(__isl_take isl_printer *p,
512 __isl_keep isl_ast_node *node, struct gpu_prog *prog)
514 isl_ast_expr *expr, *arg;
515 isl_id *id;
516 const char *name;
517 struct gpu_array_info *array;
519 expr = isl_ast_node_user_get_expr(node);
520 arg = isl_ast_expr_get_op_arg(expr, 0);
521 id = isl_ast_expr_get_id(arg);
522 name = isl_id_get_name(id);
523 array = isl_id_get_user(id);
524 isl_id_free(id);
525 isl_ast_expr_free(arg);
526 isl_ast_expr_free(expr);
528 if (!name)
529 array = NULL;
530 if (!array)
531 return isl_printer_free(p);
533 if (!prefixcmp(name, "to_device"))
534 return copy_array_to_device(p, array);
535 else
536 return copy_array_from_device(p, array);
539 struct print_host_user_data {
540 struct cuda_info *cuda;
541 struct gpu_prog *prog;
544 /* Print the user statement of the host code to "p".
546 * The host code may contain original user statements, kernel launches and
547 * statements that copy data to/from the device.
548 * The original user statements and the kernel launches have
549 * an associated annotation, while the data copy statements do not.
550 * The latter are handled by print_to_from_device.
551 * The annotation on the user statements is called "user".
553 * In case of a kernel launch, print a block of statements that
554 * defines the grid and the block and then launches the kernel.
556 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
557 __isl_take isl_ast_print_options *print_options,
558 __isl_keep isl_ast_node *node, void *user)
560 isl_id *id;
561 int is_user;
562 struct ppcg_kernel *kernel;
563 struct ppcg_kernel_stmt *stmt;
564 struct print_host_user_data *data;
566 isl_ast_print_options_free(print_options);
568 data = (struct print_host_user_data *) user;
570 id = isl_ast_node_get_annotation(node);
571 if (!id)
572 return print_to_from_device(p, node, data->prog);
574 is_user = !strcmp(isl_id_get_name(id), "user");
575 kernel = is_user ? NULL : isl_id_get_user(id);
576 stmt = is_user ? isl_id_get_user(id) : NULL;
577 isl_id_free(id);
579 if (is_user)
580 return ppcg_kernel_print_domain(p, stmt);
582 p = isl_printer_start_line(p);
583 p = isl_printer_print_str(p, "{");
584 p = isl_printer_end_line(p);
585 p = isl_printer_indent(p, 2);
587 p = isl_printer_start_line(p);
588 p = isl_printer_print_str(p, "dim3 k");
589 p = isl_printer_print_int(p, kernel->id);
590 p = isl_printer_print_str(p, "_dimBlock");
591 print_reverse_list(isl_printer_get_file(p),
592 kernel->n_block, kernel->block_dim);
593 p = isl_printer_print_str(p, ";");
594 p = isl_printer_end_line(p);
596 p = print_grid(p, kernel);
598 p = isl_printer_start_line(p);
599 p = isl_printer_print_str(p, "kernel");
600 p = isl_printer_print_int(p, kernel->id);
601 p = isl_printer_print_str(p, " <<<k");
602 p = isl_printer_print_int(p, kernel->id);
603 p = isl_printer_print_str(p, "_dimGrid, k");
604 p = isl_printer_print_int(p, kernel->id);
605 p = isl_printer_print_str(p, "_dimBlock>>> (");
606 p = print_kernel_arguments(p, data->prog, kernel, 0);
607 p = isl_printer_print_str(p, ");");
608 p = isl_printer_end_line(p);
610 p = isl_printer_start_line(p);
611 p = isl_printer_print_str(p, "cudaCheckKernel();");
612 p = isl_printer_end_line(p);
614 p = isl_printer_indent(p, -2);
615 p = isl_printer_start_line(p);
616 p = isl_printer_print_str(p, "}");
617 p = isl_printer_end_line(p);
619 p = isl_printer_start_line(p);
620 p = isl_printer_end_line(p);
622 print_kernel(data->prog, kernel, data->cuda);
624 return p;
627 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
628 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
629 struct cuda_info *cuda)
631 isl_ast_print_options *print_options;
632 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
633 struct print_host_user_data data = { cuda, prog };
635 print_options = isl_ast_print_options_alloc(ctx);
636 print_options = isl_ast_print_options_set_print_user(print_options,
637 &print_host_user, &data);
639 p = ppcg_print_macros(p, tree);
640 p = isl_ast_node_print(tree, p, print_options);
642 return p;
645 /* Given a gpu_prog "prog" and the corresponding transformed AST
646 * "tree", print the entire CUDA code to "p".
647 * "types" collects the types for which a definition has already
648 * been printed.
650 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
651 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
652 struct gpu_types *types, void *user)
654 struct cuda_info *cuda = user;
655 isl_printer *kernel;
657 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
658 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
659 kernel = gpu_print_types(kernel, types, prog);
660 isl_printer_free(kernel);
662 if (!kernel)
663 return isl_printer_free(p);
665 p = ppcg_start_block(p);
667 p = print_cuda_macros(p);
669 p = gpu_print_local_declarations(p, prog);
670 p = declare_device_arrays(p, prog);
671 p = allocate_device_arrays(p, prog);
673 p = print_host_code(p, prog, tree, cuda);
675 p = free_device_arrays(p, prog);
677 p = ppcg_end_block(p);
679 return p;
682 /* Transform the code in the file called "input" by replacing
683 * all scops by corresponding CUDA code.
684 * The names of the output files are derived from "input".
686 * We let generate_gpu do all the hard work and then let it call
687 * us back for printing the AST in print_cuda.
689 * To prepare for this printing, we first open the output files
690 * and we close them after generate_gpu has finished.
692 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
693 const char *input)
695 struct cuda_info cuda;
696 int r;
698 cuda_open_files(&cuda, input);
700 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
702 cuda_close_files(&cuda);
704 return r;