README: update latest release of clang
[ppcg.git] / cuda.c
bloba23d22aa585a8d4cb70e2ab8108de918bb0709d3
1 /*
2 * Copyright 2012 Ecole Normale Superieure
4 * Use of this software is governed by the MIT license
6 * Written by Sven Verdoolaege,
7 * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
8 */
10 #include <isl/aff.h>
11 #include <isl/ast.h>
13 #include "cuda_common.h"
14 #include "cuda.h"
15 #include "gpu.h"
16 #include "gpu_print.h"
17 #include "print.h"
18 #include "util.h"
20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
22 const char *macros =
23 "#define cudaCheckReturn(ret) \\\n"
24 " do { \\\n"
25 " cudaError_t cudaCheckReturn_e = (ret); \\\n"
26 " if (cudaCheckReturn_e != cudaSuccess) { \\\n"
27 " fprintf(stderr, \"CUDA error: %s\\n\", "
28 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
29 " fflush(stderr); \\\n"
30 " } \\\n"
31 " assert(cudaCheckReturn_e == cudaSuccess); \\\n"
32 " } while(0)\n"
33 "#define cudaCheckKernel() \\\n"
34 " do { \\\n"
35 " cudaCheckReturn(cudaGetLastError()); \\\n"
36 " } while(0)\n\n";
38 p = isl_printer_print_str(p, macros);
39 return p;
42 /* Print a declaration for the device array corresponding to "array" on "p".
44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
45 struct gpu_array_info *array)
47 int i;
49 p = isl_printer_start_line(p);
50 p = isl_printer_print_str(p, array->type);
51 p = isl_printer_print_str(p, " ");
52 if (!array->linearize && array->n_index > 1)
53 p = isl_printer_print_str(p, "(");
54 p = isl_printer_print_str(p, "*dev_");
55 p = isl_printer_print_str(p, array->name);
56 if (!array->linearize && array->n_index > 1) {
57 p = isl_printer_print_str(p, ")");
58 for (i = 1; i < array->n_index; i++) {
59 isl_ast_expr *bound;
60 bound = isl_ast_expr_get_op_arg(array->bound_expr,
61 1 + i);
62 p = isl_printer_print_str(p, "[");
63 p = isl_printer_print_ast_expr(p, bound);
64 p = isl_printer_print_str(p, "]");
65 isl_ast_expr_free(bound);
68 p = isl_printer_print_str(p, ";");
69 p = isl_printer_end_line(p);
71 return p;
74 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
75 struct gpu_prog *prog)
77 int i;
79 for (i = 0; i < prog->n_array; ++i) {
80 if (!gpu_array_requires_device_allocation(&prog->array[i]))
81 continue;
83 p = declare_device_array(p, &prog->array[i]);
85 p = isl_printer_start_line(p);
86 p = isl_printer_end_line(p);
87 return p;
90 static __isl_give isl_printer *allocate_device_arrays(
91 __isl_take isl_printer *p, struct gpu_prog *prog)
93 int i;
95 for (i = 0; i < prog->n_array; ++i) {
96 struct gpu_array_info *array = &prog->array[i];
98 if (!gpu_array_requires_device_allocation(&prog->array[i]))
99 continue;
100 p = ppcg_ast_expr_print_macros(array->bound_expr, p);
101 p = isl_printer_start_line(p);
102 p = isl_printer_print_str(p,
103 "cudaCheckReturn(cudaMalloc((void **) &dev_");
104 p = isl_printer_print_str(p, prog->array[i].name);
105 p = isl_printer_print_str(p, ", ");
106 p = gpu_array_info_print_size(p, &prog->array[i]);
107 p = isl_printer_print_str(p, "));");
108 p = isl_printer_end_line(p);
110 p = isl_printer_start_line(p);
111 p = isl_printer_end_line(p);
112 return p;
115 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
116 struct gpu_prog *prog)
118 int i;
120 for (i = 0; i < prog->n_array; ++i) {
121 if (!gpu_array_requires_device_allocation(&prog->array[i]))
122 continue;
123 p = isl_printer_start_line(p);
124 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
125 p = isl_printer_print_str(p, prog->array[i].name);
126 p = isl_printer_print_str(p, "));");
127 p = isl_printer_end_line(p);
130 return p;
133 /* Print code to "p" for copying "array" from the host to the device
134 * in its entirety. The bounds on the extent of "array" have
135 * been precomputed in extract_array_info and are used in
136 * gpu_array_info_print_size.
138 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
139 struct gpu_array_info *array)
141 p = isl_printer_start_line(p);
142 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
143 p = isl_printer_print_str(p, array->name);
144 p = isl_printer_print_str(p, ", ");
146 if (gpu_array_is_scalar(array))
147 p = isl_printer_print_str(p, "&");
148 p = isl_printer_print_str(p, array->name);
149 p = isl_printer_print_str(p, ", ");
151 p = gpu_array_info_print_size(p, array);
152 p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
153 p = isl_printer_end_line(p);
155 return p;
158 /* Print code to "p" for copying "array" back from the device to the host
159 * in its entirety. The bounds on the extent of "array" have
160 * been precomputed in extract_array_info and are used in
161 * gpu_array_info_print_size.
163 static __isl_give isl_printer *copy_array_from_device(
164 __isl_take isl_printer *p, struct gpu_array_info *array)
166 p = isl_printer_start_line(p);
167 p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
168 if (gpu_array_is_scalar(array))
169 p = isl_printer_print_str(p, "&");
170 p = isl_printer_print_str(p, array->name);
171 p = isl_printer_print_str(p, ", dev_");
172 p = isl_printer_print_str(p, array->name);
173 p = isl_printer_print_str(p, ", ");
174 p = gpu_array_info_print_size(p, array);
175 p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
176 p = isl_printer_end_line(p);
178 return p;
181 static void print_reverse_list(FILE *out, int len, int *list)
183 int i;
185 if (!out || len == 0)
186 return;
188 fprintf(out, "(");
189 for (i = 0; i < len; ++i) {
190 if (i)
191 fprintf(out, ", ");
192 fprintf(out, "%d", list[len - 1 - i]);
194 fprintf(out, ")");
197 /* Print the effective grid size as a list of the sizes in each
198 * dimension, from innermost to outermost.
200 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
201 struct ppcg_kernel *kernel)
203 int i;
204 int dim;
206 dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
207 if (dim == 0)
208 return p;
210 p = isl_printer_print_str(p, "(");
211 for (i = dim - 1; i >= 0; --i) {
212 isl_ast_expr *bound;
214 bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
215 p = isl_printer_print_ast_expr(p, bound);
216 isl_ast_expr_free(bound);
218 if (i > 0)
219 p = isl_printer_print_str(p, ", ");
222 p = isl_printer_print_str(p, ")");
224 return p;
227 /* Print the grid definition.
229 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
230 struct ppcg_kernel *kernel)
232 p = isl_printer_start_line(p);
233 p = isl_printer_print_str(p, "dim3 k");
234 p = isl_printer_print_int(p, kernel->id);
235 p = isl_printer_print_str(p, "_dimGrid");
236 p = print_grid_size(p, kernel);
237 p = isl_printer_print_str(p, ";");
238 p = isl_printer_end_line(p);
240 return p;
243 /* Print the arguments to a kernel declaration or call. If "types" is set,
244 * then print a declaration (including the types of the arguments).
246 * The arguments are printed in the following order
247 * - the arrays accessed by the kernel
248 * - the parameters
249 * - the host loop iterators
251 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
252 struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
254 int i, n;
255 int first = 1;
256 unsigned nparam;
257 isl_space *space;
258 const char *type;
260 for (i = 0; i < prog->n_array; ++i) {
261 int required;
263 required = ppcg_kernel_requires_array_argument(kernel, i);
264 if (required < 0)
265 return isl_printer_free(p);
266 if (!required)
267 continue;
269 if (!first)
270 p = isl_printer_print_str(p, ", ");
272 if (types)
273 p = gpu_array_info_print_declaration_argument(p,
274 &prog->array[i], NULL);
275 else
276 p = gpu_array_info_print_call_argument(p,
277 &prog->array[i]);
279 first = 0;
282 space = isl_union_set_get_space(kernel->arrays);
283 nparam = isl_space_dim(space, isl_dim_param);
284 for (i = 0; i < nparam; ++i) {
285 const char *name;
287 name = isl_space_get_dim_name(space, isl_dim_param, i);
289 if (!first)
290 p = isl_printer_print_str(p, ", ");
291 if (types)
292 p = isl_printer_print_str(p, "int ");
293 p = isl_printer_print_str(p, name);
295 first = 0;
297 isl_space_free(space);
299 n = isl_space_dim(kernel->space, isl_dim_set);
300 type = isl_options_get_ast_iterator_type(prog->ctx);
301 for (i = 0; i < n; ++i) {
302 const char *name;
304 if (!first)
305 p = isl_printer_print_str(p, ", ");
306 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
307 if (types) {
308 p = isl_printer_print_str(p, type);
309 p = isl_printer_print_str(p, " ");
311 p = isl_printer_print_str(p, name);
313 first = 0;
316 return p;
319 /* Print the header of the given kernel.
321 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
322 struct gpu_prog *prog, struct ppcg_kernel *kernel)
324 p = isl_printer_start_line(p);
325 p = isl_printer_print_str(p, "__global__ void kernel");
326 p = isl_printer_print_int(p, kernel->id);
327 p = isl_printer_print_str(p, "(");
328 p = print_kernel_arguments(p, prog, kernel, 1);
329 p = isl_printer_print_str(p, ")");
331 return p;
334 /* Print the header of the given kernel to both gen->cuda.kernel_h
335 * and gen->cuda.kernel_c.
337 static void print_kernel_headers(struct gpu_prog *prog,
338 struct ppcg_kernel *kernel, struct cuda_info *cuda)
340 isl_printer *p;
342 p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
343 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
344 p = print_kernel_header(p, prog, kernel);
345 p = isl_printer_print_str(p, ";");
346 p = isl_printer_end_line(p);
347 isl_printer_free(p);
349 p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
350 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
351 p = print_kernel_header(p, prog, kernel);
352 p = isl_printer_end_line(p);
353 isl_printer_free(p);
356 static void print_indent(FILE *dst, int indent)
358 fprintf(dst, "%*s", indent, "");
361 /* Print a list of iterators of type "type" with names "ids" to "out".
362 * Each iterator is assigned one of the cuda identifiers in cuda_dims.
363 * In particular, the last iterator is assigned the x identifier
364 * (the first in the list of cuda identifiers).
366 static void print_iterators(FILE *out, const char *type,
367 __isl_keep isl_id_list *ids, const char *cuda_dims[])
369 int i, n;
371 n = isl_id_list_n_id(ids);
372 if (n <= 0)
373 return;
374 print_indent(out, 4);
375 fprintf(out, "%s ", type);
376 for (i = 0; i < n; ++i) {
377 isl_id *id;
379 if (i)
380 fprintf(out, ", ");
381 id = isl_id_list_get_id(ids, i);
382 fprintf(out, "%s = %s", isl_id_get_name(id),
383 cuda_dims[n - 1 - i]);
384 isl_id_free(id);
386 fprintf(out, ";\n");
389 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
391 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
392 const char *type;
393 const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
394 const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
395 "threadIdx.z" };
397 type = isl_options_get_ast_iterator_type(ctx);
399 print_iterators(out, type, kernel->block_ids, block_dims);
400 print_iterators(out, type, kernel->thread_ids, thread_dims);
403 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
404 struct ppcg_kernel_var *var)
406 int j;
408 p = isl_printer_start_line(p);
409 if (var->type == ppcg_access_shared)
410 p = isl_printer_print_str(p, "__shared__ ");
411 p = isl_printer_print_str(p, var->array->type);
412 p = isl_printer_print_str(p, " ");
413 p = isl_printer_print_str(p, var->name);
414 for (j = 0; j < var->array->n_index; ++j) {
415 isl_val *v;
417 p = isl_printer_print_str(p, "[");
418 v = isl_vec_get_element_val(var->size, j);
419 p = isl_printer_print_val(p, v);
420 isl_val_free(v);
421 p = isl_printer_print_str(p, "]");
423 p = isl_printer_print_str(p, ";");
424 p = isl_printer_end_line(p);
426 return p;
429 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
430 struct ppcg_kernel *kernel)
432 int i;
434 for (i = 0; i < kernel->n_var; ++i)
435 p = print_kernel_var(p, &kernel->var[i]);
437 return p;
440 /* Print a sync statement.
442 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
443 struct ppcg_kernel_stmt *stmt)
445 p = isl_printer_start_line(p);
446 p = isl_printer_print_str(p, "__syncthreads();");
447 p = isl_printer_end_line(p);
449 return p;
452 /* This function is called for each user statement in the AST,
453 * i.e., for each kernel body statement, copy statement or sync statement.
455 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
456 __isl_take isl_ast_print_options *print_options,
457 __isl_keep isl_ast_node *node, void *user)
459 isl_id *id;
460 struct ppcg_kernel_stmt *stmt;
462 id = isl_ast_node_get_annotation(node);
463 stmt = isl_id_get_user(id);
464 isl_id_free(id);
466 isl_ast_print_options_free(print_options);
468 switch (stmt->type) {
469 case ppcg_kernel_copy:
470 return ppcg_kernel_print_copy(p, stmt);
471 case ppcg_kernel_sync:
472 return print_sync(p, stmt);
473 case ppcg_kernel_domain:
474 return ppcg_kernel_print_domain(p, stmt);
477 return p;
480 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
481 struct cuda_info *cuda)
483 isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
484 isl_ast_print_options *print_options;
485 isl_printer *p;
487 print_kernel_headers(prog, kernel, cuda);
488 fprintf(cuda->kernel_c, "{\n");
489 print_kernel_iterators(cuda->kernel_c, kernel);
491 p = isl_printer_to_file(ctx, cuda->kernel_c);
492 p = isl_printer_set_output_format(p, ISL_FORMAT_C);
493 p = isl_printer_indent(p, 4);
495 p = print_kernel_vars(p, kernel);
496 p = isl_printer_end_line(p);
497 p = ppcg_set_macro_names(p);
498 p = gpu_print_macros(p, kernel->tree);
500 print_options = isl_ast_print_options_alloc(ctx);
501 print_options = isl_ast_print_options_set_print_user(print_options,
502 &print_kernel_stmt, NULL);
503 p = isl_ast_node_print(kernel->tree, p, print_options);
504 isl_printer_free(p);
506 fprintf(cuda->kernel_c, "}\n");
509 /* Print code for initializing the device for execution of the transformed
510 * code. This includes declaring locally defined variables as well as
511 * declaring and allocating the required copies of arrays on the device.
513 static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
514 struct gpu_prog *prog)
516 p = print_cuda_macros(p);
518 p = gpu_print_local_declarations(p, prog);
519 p = declare_device_arrays(p, prog);
520 p = allocate_device_arrays(p, prog);
522 return p;
525 /* Print code for clearing the device after execution of the transformed code.
526 * In particular, free the memory that was allocated on the device.
528 static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
529 struct gpu_prog *prog)
531 p = free_device_arrays(p, prog);
533 return p;
536 /* Print a statement for copying an array to or from the device,
537 * or for initializing or clearing the device.
538 * The statement identifier of a copying node is called
539 * "to_device_<array name>" or "from_device_<array name>" and
540 * its user pointer points to the gpu_array_info of the array
541 * that needs to be copied.
542 * The node for initializing the device is called "init_device".
543 * The node for clearing the device is called "clear_device".
545 * Extract the array (if any) from the identifier and call
546 * init_device, clear_device, copy_array_to_device or copy_array_from_device.
548 static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
549 __isl_keep isl_ast_node *node, struct gpu_prog *prog)
551 isl_ast_expr *expr, *arg;
552 isl_id *id;
553 const char *name;
554 struct gpu_array_info *array;
556 expr = isl_ast_node_user_get_expr(node);
557 arg = isl_ast_expr_get_op_arg(expr, 0);
558 id = isl_ast_expr_get_id(arg);
559 name = isl_id_get_name(id);
560 array = isl_id_get_user(id);
561 isl_id_free(id);
562 isl_ast_expr_free(arg);
563 isl_ast_expr_free(expr);
565 if (!name)
566 return isl_printer_free(p);
567 if (!strcmp(name, "init_device"))
568 return init_device(p, prog);
569 if (!strcmp(name, "clear_device"))
570 return clear_device(p, prog);
571 if (!array)
572 return isl_printer_free(p);
574 if (!prefixcmp(name, "to_device"))
575 return copy_array_to_device(p, array);
576 else
577 return copy_array_from_device(p, array);
580 struct print_host_user_data {
581 struct cuda_info *cuda;
582 struct gpu_prog *prog;
585 /* Print the user statement of the host code to "p".
587 * The host code may contain original user statements, kernel launches,
588 * statements that copy data to/from the device and statements
589 * the initialize or clear the device.
590 * The original user statements and the kernel launches have
591 * an associated annotation, while the other statements do not.
592 * The latter are handled by print_device_node.
593 * The annotation on the user statements is called "user".
595 * In case of a kernel launch, print a block of statements that
596 * defines the grid and the block and then launches the kernel.
598 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
599 __isl_take isl_ast_print_options *print_options,
600 __isl_keep isl_ast_node *node, void *user)
602 isl_id *id;
603 int is_user;
604 struct ppcg_kernel *kernel;
605 struct ppcg_kernel_stmt *stmt;
606 struct print_host_user_data *data;
608 isl_ast_print_options_free(print_options);
610 data = (struct print_host_user_data *) user;
612 id = isl_ast_node_get_annotation(node);
613 if (!id)
614 return print_device_node(p, node, data->prog);
616 is_user = !strcmp(isl_id_get_name(id), "user");
617 kernel = is_user ? NULL : isl_id_get_user(id);
618 stmt = is_user ? isl_id_get_user(id) : NULL;
619 isl_id_free(id);
621 if (is_user)
622 return ppcg_kernel_print_domain(p, stmt);
624 p = ppcg_start_block(p);
626 p = isl_printer_start_line(p);
627 p = isl_printer_print_str(p, "dim3 k");
628 p = isl_printer_print_int(p, kernel->id);
629 p = isl_printer_print_str(p, "_dimBlock");
630 print_reverse_list(isl_printer_get_file(p),
631 kernel->n_block, kernel->block_dim);
632 p = isl_printer_print_str(p, ";");
633 p = isl_printer_end_line(p);
635 p = print_grid(p, kernel);
637 p = isl_printer_start_line(p);
638 p = isl_printer_print_str(p, "kernel");
639 p = isl_printer_print_int(p, kernel->id);
640 p = isl_printer_print_str(p, " <<<k");
641 p = isl_printer_print_int(p, kernel->id);
642 p = isl_printer_print_str(p, "_dimGrid, k");
643 p = isl_printer_print_int(p, kernel->id);
644 p = isl_printer_print_str(p, "_dimBlock>>> (");
645 p = print_kernel_arguments(p, data->prog, kernel, 0);
646 p = isl_printer_print_str(p, ");");
647 p = isl_printer_end_line(p);
649 p = isl_printer_start_line(p);
650 p = isl_printer_print_str(p, "cudaCheckKernel();");
651 p = isl_printer_end_line(p);
653 p = ppcg_end_block(p);
655 p = isl_printer_start_line(p);
656 p = isl_printer_end_line(p);
658 print_kernel(data->prog, kernel, data->cuda);
660 return p;
663 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
664 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
665 struct cuda_info *cuda)
667 isl_ast_print_options *print_options;
668 isl_ctx *ctx = isl_ast_node_get_ctx(tree);
669 struct print_host_user_data data = { cuda, prog };
671 print_options = isl_ast_print_options_alloc(ctx);
672 print_options = isl_ast_print_options_set_print_user(print_options,
673 &print_host_user, &data);
675 p = gpu_print_macros(p, tree);
676 p = isl_ast_node_print(tree, p, print_options);
678 return p;
681 /* Given a gpu_prog "prog" and the corresponding transformed AST
682 * "tree", print the entire CUDA code to "p".
683 * "types" collects the types for which a definition has already
684 * been printed.
686 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
687 struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
688 struct gpu_types *types, void *user)
690 struct cuda_info *cuda = user;
691 isl_printer *kernel;
693 kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
694 kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
695 kernel = gpu_print_types(kernel, types, prog);
696 isl_printer_free(kernel);
698 if (!kernel)
699 return isl_printer_free(p);
701 p = print_host_code(p, prog, tree, cuda);
703 return p;
706 /* Transform the code in the file called "input" by replacing
707 * all scops by corresponding CUDA code.
708 * The names of the output files are derived from "input".
710 * We let generate_gpu do all the hard work and then let it call
711 * us back for printing the AST in print_cuda.
713 * To prepare for this printing, we first open the output files
714 * and we close them after generate_gpu has finished.
716 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
717 const char *input)
719 struct cuda_info cuda;
720 int r;
722 cuda_open_files(&cuda, input);
724 r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
726 cuda_close_files(&cuda);
728 return r;