cuda.c

   1 /*
   2  * Copyright 2012      Ecole Normale Superieure
   3  *
   4  * Use of this software is governed by the MIT license
   5  *
   6  * Written by Sven Verdoolaege,
   7  * Ecole Normale Superieure, 45 rue d’Ulm, 75230 Paris, France
   8  */
   9
  10 #include <isl/aff.h>
  11 #include <isl/ast.h>
  12
  13 #include "cuda_common.h"
  14 #include "cuda.h"
  15 #include "gpu.h"
  16 #include "gpu_print.h"
  17 #include "print.h"
  18 #include "util.h"
  19
  20 static __isl_give isl_printer *print_cuda_macros(__isl_take isl_printer *p)
  21 {
  22         const char *macros =
  23                 "#define cudaCheckReturn(ret) \\\n"
  24                 "  do { \\\n"
  25                 "    cudaError_t cudaCheckReturn_e = (ret); \\\n"
  26                 "    if (cudaCheckReturn_e != cudaSuccess) { \\\n"
  27                 "      fprintf(stderr, \"CUDA error: %s\\n\", "
  28                 "cudaGetErrorString(cudaCheckReturn_e)); \\\n"
  29                 "      fflush(stderr); \\\n"
  30                 "    } \\\n"
  31                 "    assert(cudaCheckReturn_e == cudaSuccess); \\\n"
  32                 "  } while(0)\n"
  33                 "#define cudaCheckKernel() \\\n"
  34                 "  do { \\\n"
  35                 "    cudaCheckReturn(cudaGetLastError()); \\\n"
  36                 "  } while(0)\n\n";
  37
  38         p = isl_printer_print_str(p, macros);
  39         return p;
  40 }
  41
  42 /* Print a declaration for the device array corresponding to "array" on "p".
  43  */
  44 static __isl_give isl_printer *declare_device_array(__isl_take isl_printer *p,
  45         struct gpu_array_info *array)
  46 {
  47         int i;
  48
  49         p = isl_printer_start_line(p);
  50         p = isl_printer_print_str(p, array->type);
  51         p = isl_printer_print_str(p, " ");
  52         if (!array->linearize && array->n_index > 1)
  53                 p = isl_printer_print_str(p, "(");
  54         p = isl_printer_print_str(p, "*dev_");
  55         p = isl_printer_print_str(p, array->name);
  56         if (!array->linearize && array->n_index > 1) {
  57                 p = isl_printer_print_str(p, ")");
  58                 for (i = 1; i < array->n_index; i++) {
  59                         isl_ast_expr *bound;
  60                         bound = isl_ast_expr_get_op_arg(array->bound_expr,
  61                                                         1 + i);
  62                         p = isl_printer_print_str(p, "[");
  63                         p = isl_printer_print_ast_expr(p, bound);
  64                         p = isl_printer_print_str(p, "]");
  65                         isl_ast_expr_free(bound);
  66                 }
  67         }
  68         p = isl_printer_print_str(p, ";");
  69         p = isl_printer_end_line(p);
  70
  71         return p;
  72 }
  73
  74 static __isl_give isl_printer *declare_device_arrays(__isl_take isl_printer *p,
  75         struct gpu_prog *prog)
  76 {
  77         int i;
  78
  79         for (i = 0; i < prog->n_array; ++i) {
  80                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
  81                         continue;
  82
  83                 p = declare_device_array(p, &prog->array[i]);
  84         }
  85         p = isl_printer_start_line(p);
  86         p = isl_printer_end_line(p);
  87         return p;
  88 }
  89
  90 static __isl_give isl_printer *allocate_device_arrays(
  91         __isl_take isl_printer *p, struct gpu_prog *prog)
  92 {
  93         int i;
  94
  95         for (i = 0; i < prog->n_array; ++i) {
  96                 struct gpu_array_info *array = &prog->array[i];
  97
  98                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
  99                         continue;
 100                 p = ppcg_ast_expr_print_macros(array->bound_expr, p);
 101                 p = isl_printer_start_line(p);
 102                 p = isl_printer_print_str(p,
 103                         "cudaCheckReturn(cudaMalloc((void **) &dev_");
 104                 p = isl_printer_print_str(p, prog->array[i].name);
 105                 p = isl_printer_print_str(p, ", ");
 106                 p = gpu_array_info_print_size(p, &prog->array[i]);
 107                 p = isl_printer_print_str(p, "));");
 108                 p = isl_printer_end_line(p);
 109         }
 110         p = isl_printer_start_line(p);
 111         p = isl_printer_end_line(p);
 112         return p;
 113 }
 114
 115 static __isl_give isl_printer *free_device_arrays(__isl_take isl_printer *p,
 116         struct gpu_prog *prog)
 117 {
 118         int i;
 119
 120         for (i = 0; i < prog->n_array; ++i) {
 121                 if (!gpu_array_requires_device_allocation(&prog->array[i]))
 122                         continue;
 123                 p = isl_printer_start_line(p);
 124                 p = isl_printer_print_str(p, "cudaCheckReturn(cudaFree(dev_");
 125                 p = isl_printer_print_str(p, prog->array[i].name);
 126                 p = isl_printer_print_str(p, "));");
 127                 p = isl_printer_end_line(p);
 128         }
 129
 130         return p;
 131 }
 132
 133 /* Print code to "p" for copying "array" from the host to the device
 134  * in its entirety.  The bounds on the extent of "array" have
 135  * been precomputed in extract_array_info and are used in
 136  * gpu_array_info_print_size.
 137  */
 138 static __isl_give isl_printer *copy_array_to_device(__isl_take isl_printer *p,
 139         struct gpu_array_info *array)
 140 {
 141         p = isl_printer_start_line(p);
 142         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(dev_");
 143         p = isl_printer_print_str(p, array->name);
 144         p = isl_printer_print_str(p, ", ");
 145
 146         if (gpu_array_is_scalar(array))
 147                 p = isl_printer_print_str(p, "&");
 148         p = isl_printer_print_str(p, array->name);
 149         p = isl_printer_print_str(p, ", ");
 150
 151         p = gpu_array_info_print_size(p, array);
 152         p = isl_printer_print_str(p, ", cudaMemcpyHostToDevice));");
 153         p = isl_printer_end_line(p);
 154
 155         return p;
 156 }
 157
 158 /* Print code to "p" for copying "array" back from the device to the host
 159  * in its entirety.  The bounds on the extent of "array" have
 160  * been precomputed in extract_array_info and are used in
 161  * gpu_array_info_print_size.
 162  */
 163 static __isl_give isl_printer *copy_array_from_device(
 164         __isl_take isl_printer *p, struct gpu_array_info *array)
 165 {
 166         p = isl_printer_start_line(p);
 167         p = isl_printer_print_str(p, "cudaCheckReturn(cudaMemcpy(");
 168         if (gpu_array_is_scalar(array))
 169                 p = isl_printer_print_str(p, "&");
 170         p = isl_printer_print_str(p, array->name);
 171         p = isl_printer_print_str(p, ", dev_");
 172         p = isl_printer_print_str(p, array->name);
 173         p = isl_printer_print_str(p, ", ");
 174         p = gpu_array_info_print_size(p, array);
 175         p = isl_printer_print_str(p, ", cudaMemcpyDeviceToHost));");
 176         p = isl_printer_end_line(p);
 177
 178         return p;
 179 }
 180
 181 static void print_reverse_list(FILE *out, int len, int *list)
 182 {
 183         int i;
 184
 185         if (!out || len == 0)
 186                 return;
 187
 188         fprintf(out, "(");
 189         for (i = 0; i < len; ++i) {
 190                 if (i)
 191                         fprintf(out, ", ");
 192                 fprintf(out, "%d", list[len - 1 - i]);
 193         }
 194         fprintf(out, ")");
 195 }
 196
 197 /* Print the effective grid size as a list of the sizes in each
 198  * dimension, from innermost to outermost.
 199  */
 200 static __isl_give isl_printer *print_grid_size(__isl_take isl_printer *p,
 201         struct ppcg_kernel *kernel)
 202 {
 203         int i;
 204         int dim;
 205
 206         dim = isl_multi_pw_aff_dim(kernel->grid_size, isl_dim_set);
 207         if (dim == 0)
 208                 return p;
 209
 210         p = isl_printer_print_str(p, "(");
 211         for (i = dim - 1; i >= 0; --i) {
 212                 isl_ast_expr *bound;
 213
 214                 bound = isl_ast_expr_get_op_arg(kernel->grid_size_expr, 1 + i);
 215                 p = isl_printer_print_ast_expr(p, bound);
 216                 isl_ast_expr_free(bound);
 217
 218                 if (i > 0)
 219                         p = isl_printer_print_str(p, ", ");
 220         }
 221
 222         p = isl_printer_print_str(p, ")");
 223
 224         return p;
 225 }
 226
 227 /* Print the grid definition.
 228  */
 229 static __isl_give isl_printer *print_grid(__isl_take isl_printer *p,
 230         struct ppcg_kernel *kernel)
 231 {
 232         p = isl_printer_start_line(p);
 233         p = isl_printer_print_str(p, "dim3 k");
 234         p = isl_printer_print_int(p, kernel->id);
 235         p = isl_printer_print_str(p, "_dimGrid");
 236         p = print_grid_size(p, kernel);
 237         p = isl_printer_print_str(p, ";");
 238         p = isl_printer_end_line(p);
 239
 240         return p;
 241 }
 242
 243 /* Print the arguments to a kernel declaration or call.  If "types" is set,
 244  * then print a declaration (including the types of the arguments).
 245  *
 246  * The arguments are printed in the following order
 247  * - the arrays accessed by the kernel
 248  * - the parameters
 249  * - the host loop iterators
 250  */
 251 static __isl_give isl_printer *print_kernel_arguments(__isl_take isl_printer *p,
 252         struct gpu_prog *prog, struct ppcg_kernel *kernel, int types)
 253 {
 254         int i, n;
 255         int first = 1;
 256         unsigned nparam;
 257         isl_space *space;
 258         const char *type;
 259
 260         for (i = 0; i < prog->n_array; ++i) {
 261                 int required;
 262
 263                 required = ppcg_kernel_requires_array_argument(kernel, i);
 264                 if (required < 0)
 265                         return isl_printer_free(p);
 266                 if (!required)
 267                         continue;
 268
 269                 if (!first)
 270                         p = isl_printer_print_str(p, ", ");
 271
 272                 if (types)
 273                         p = gpu_array_info_print_declaration_argument(p,
 274                                 &prog->array[i], NULL);
 275                 else
 276                         p = gpu_array_info_print_call_argument(p,
 277                                 &prog->array[i]);
 278
 279                 first = 0;
 280         }
 281
 282         space = isl_union_set_get_space(kernel->arrays);
 283         nparam = isl_space_dim(space, isl_dim_param);
 284         for (i = 0; i < nparam; ++i) {
 285                 const char *name;
 286
 287                 name = isl_space_get_dim_name(space, isl_dim_param, i);
 288
 289                 if (!first)
 290                         p = isl_printer_print_str(p, ", ");
 291                 if (types)
 292                         p = isl_printer_print_str(p, "int ");
 293                 p = isl_printer_print_str(p, name);
 294
 295                 first = 0;
 296         }
 297         isl_space_free(space);
 298
 299         n = isl_space_dim(kernel->space, isl_dim_set);
 300         type = isl_options_get_ast_iterator_type(prog->ctx);
 301         for (i = 0; i < n; ++i) {
 302                 const char *name;
 303
 304                 if (!first)
 305                         p = isl_printer_print_str(p, ", ");
 306                 name = isl_space_get_dim_name(kernel->space, isl_dim_set, i);
 307                 if (types) {
 308                         p = isl_printer_print_str(p, type);
 309                         p = isl_printer_print_str(p, " ");
 310                 }
 311                 p = isl_printer_print_str(p, name);
 312
 313                 first = 0;
 314         }
 315
 316         return p;
 317 }
 318
 319 /* Print the header of the given kernel.
 320  */
 321 static __isl_give isl_printer *print_kernel_header(__isl_take isl_printer *p,
 322         struct gpu_prog *prog, struct ppcg_kernel *kernel)
 323 {
 324         p = isl_printer_start_line(p);
 325         p = isl_printer_print_str(p, "__global__ void kernel");
 326         p = isl_printer_print_int(p, kernel->id);
 327         p = isl_printer_print_str(p, "(");
 328         p = print_kernel_arguments(p, prog, kernel, 1);
 329         p = isl_printer_print_str(p, ")");
 330
 331         return p;
 332 }
 333
 334 /* Print the header of the given kernel to both gen->cuda.kernel_h
 335  * and gen->cuda.kernel_c.
 336  */
 337 static void print_kernel_headers(struct gpu_prog *prog,
 338         struct ppcg_kernel *kernel, struct cuda_info *cuda)
 339 {
 340         isl_printer *p;
 341
 342         p = isl_printer_to_file(prog->ctx, cuda->kernel_h);
 343         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 344         p = print_kernel_header(p, prog, kernel);
 345         p = isl_printer_print_str(p, ";");
 346         p = isl_printer_end_line(p);
 347         isl_printer_free(p);
 348
 349         p = isl_printer_to_file(prog->ctx, cuda->kernel_c);
 350         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 351         p = print_kernel_header(p, prog, kernel);
 352         p = isl_printer_end_line(p);
 353         isl_printer_free(p);
 354 }
 355
 356 static void print_indent(FILE *dst, int indent)
 357 {
 358         fprintf(dst, "%*s", indent, "");
 359 }
 360
 361 /* Print a list of iterators of type "type" with names "ids" to "out".
 362  * Each iterator is assigned one of the cuda identifiers in cuda_dims.
 363  * In particular, the last iterator is assigned the x identifier
 364  * (the first in the list of cuda identifiers).
 365  */
 366 static void print_iterators(FILE *out, const char *type,
 367         __isl_keep isl_id_list *ids, const char *cuda_dims[])
 368 {
 369         int i, n;
 370
 371         n = isl_id_list_n_id(ids);
 372         if (n <= 0)
 373                 return;
 374         print_indent(out, 4);
 375         fprintf(out, "%s ", type);
 376         for (i = 0; i < n; ++i) {
 377                 isl_id *id;
 378
 379                 if (i)
 380                         fprintf(out, ", ");
 381                 id = isl_id_list_get_id(ids, i);
 382                 fprintf(out, "%s = %s", isl_id_get_name(id),
 383                         cuda_dims[n - 1 - i]);
 384                 isl_id_free(id);
 385         }
 386         fprintf(out, ";\n");
 387 }
 388
 389 static void print_kernel_iterators(FILE *out, struct ppcg_kernel *kernel)
 390 {
 391         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 392         const char *type;
 393         const char *block_dims[] = { "blockIdx.x", "blockIdx.y" };
 394         const char *thread_dims[] = { "threadIdx.x", "threadIdx.y",
 395                                         "threadIdx.z" };
 396
 397         type = isl_options_get_ast_iterator_type(ctx);
 398
 399         print_iterators(out, type, kernel->block_ids, block_dims);
 400         print_iterators(out, type, kernel->thread_ids, thread_dims);
 401 }
 402
 403 static __isl_give isl_printer *print_kernel_var(__isl_take isl_printer *p,
 404         struct ppcg_kernel_var *var)
 405 {
 406         int j;
 407
 408         p = isl_printer_start_line(p);
 409         if (var->type == ppcg_access_shared)
 410                 p = isl_printer_print_str(p, "__shared__ ");
 411         p = isl_printer_print_str(p, var->array->type);
 412         p = isl_printer_print_str(p, " ");
 413         p = isl_printer_print_str(p,  var->name);
 414         for (j = 0; j < var->array->n_index; ++j) {
 415                 isl_val *v;
 416
 417                 p = isl_printer_print_str(p, "[");
 418                 v = isl_vec_get_element_val(var->size, j);
 419                 p = isl_printer_print_val(p, v);
 420                 isl_val_free(v);
 421                 p = isl_printer_print_str(p, "]");
 422         }
 423         p = isl_printer_print_str(p, ";");
 424         p = isl_printer_end_line(p);
 425
 426         return p;
 427 }
 428
 429 static __isl_give isl_printer *print_kernel_vars(__isl_take isl_printer *p,
 430         struct ppcg_kernel *kernel)
 431 {
 432         int i;
 433
 434         for (i = 0; i < kernel->n_var; ++i)
 435                 p = print_kernel_var(p, &kernel->var[i]);
 436
 437         return p;
 438 }
 439
 440 /* Print a sync statement.
 441  */
 442 static __isl_give isl_printer *print_sync(__isl_take isl_printer *p,
 443         struct ppcg_kernel_stmt *stmt)
 444 {
 445         p = isl_printer_start_line(p);
 446         p = isl_printer_print_str(p, "__syncthreads();");
 447         p = isl_printer_end_line(p);
 448
 449         return p;
 450 }
 451
 452 /* This function is called for each user statement in the AST,
 453  * i.e., for each kernel body statement, copy statement or sync statement.
 454  */
 455 static __isl_give isl_printer *print_kernel_stmt(__isl_take isl_printer *p,
 456         __isl_take isl_ast_print_options *print_options,
 457         __isl_keep isl_ast_node *node, void *user)
 458 {
 459         isl_id *id;
 460         struct ppcg_kernel_stmt *stmt;
 461
 462         id = isl_ast_node_get_annotation(node);
 463         stmt = isl_id_get_user(id);
 464         isl_id_free(id);
 465
 466         isl_ast_print_options_free(print_options);
 467
 468         switch (stmt->type) {
 469         case ppcg_kernel_copy:
 470                 return ppcg_kernel_print_copy(p, stmt);
 471         case ppcg_kernel_sync:
 472                 return print_sync(p, stmt);
 473         case ppcg_kernel_domain:
 474                 return ppcg_kernel_print_domain(p, stmt);
 475         }
 476
 477         return p;
 478 }
 479
 480 static void print_kernel(struct gpu_prog *prog, struct ppcg_kernel *kernel,
 481         struct cuda_info *cuda)
 482 {
 483         isl_ctx *ctx = isl_ast_node_get_ctx(kernel->tree);
 484         isl_ast_print_options *print_options;
 485         isl_printer *p;
 486
 487         print_kernel_headers(prog, kernel, cuda);
 488         fprintf(cuda->kernel_c, "{\n");
 489         print_kernel_iterators(cuda->kernel_c, kernel);
 490
 491         p = isl_printer_to_file(ctx, cuda->kernel_c);
 492         p = isl_printer_set_output_format(p, ISL_FORMAT_C);
 493         p = isl_printer_indent(p, 4);
 494
 495         p = print_kernel_vars(p, kernel);
 496         p = isl_printer_end_line(p);
 497         p = ppcg_set_macro_names(p);
 498         p = gpu_print_macros(p, kernel->tree);
 499
 500         print_options = isl_ast_print_options_alloc(ctx);
 501         print_options = isl_ast_print_options_set_print_user(print_options,
 502                                                     &print_kernel_stmt, NULL);
 503         p = isl_ast_node_print(kernel->tree, p, print_options);
 504         isl_printer_free(p);
 505
 506         fprintf(cuda->kernel_c, "}\n");
 507 }
 508
 509 /* Print code for initializing the device for execution of the transformed
 510  * code.  This includes declaring locally defined variables as well as
 511  * declaring and allocating the required copies of arrays on the device.
 512  */
 513 static __isl_give isl_printer *init_device(__isl_take isl_printer *p,
 514         struct gpu_prog *prog)
 515 {
 516         p = print_cuda_macros(p);
 517
 518         p = gpu_print_local_declarations(p, prog);
 519         p = declare_device_arrays(p, prog);
 520         p = allocate_device_arrays(p, prog);
 521
 522         return p;
 523 }
 524
 525 /* Print code for clearing the device after execution of the transformed code.
 526  * In particular, free the memory that was allocated on the device.
 527  */
 528 static __isl_give isl_printer *clear_device(__isl_take isl_printer *p,
 529         struct gpu_prog *prog)
 530 {
 531         p = free_device_arrays(p, prog);
 532
 533         return p;
 534 }
 535
 536 /* Print a statement for copying an array to or from the device,
 537  * or for initializing or clearing the device.
 538  * The statement identifier of a copying node is called
 539  * "to_device_<array name>" or "from_device_<array name>" and
 540  * its user pointer points to the gpu_array_info of the array
 541  * that needs to be copied.
 542  * The node for initializing the device is called "init_device".
 543  * The node for clearing the device is called "clear_device".
 544  *
 545  * Extract the array (if any) from the identifier and call
 546  * init_device, clear_device, copy_array_to_device or copy_array_from_device.
 547  */
 548 static __isl_give isl_printer *print_device_node(__isl_take isl_printer *p,
 549         __isl_keep isl_ast_node *node, struct gpu_prog *prog)
 550 {
 551         isl_ast_expr *expr, *arg;
 552         isl_id *id;
 553         const char *name;
 554         struct gpu_array_info *array;
 555
 556         expr = isl_ast_node_user_get_expr(node);
 557         arg = isl_ast_expr_get_op_arg(expr, 0);
 558         id = isl_ast_expr_get_id(arg);
 559         name = isl_id_get_name(id);
 560         array = isl_id_get_user(id);
 561         isl_id_free(id);
 562         isl_ast_expr_free(arg);
 563         isl_ast_expr_free(expr);
 564
 565         if (!name)
 566                 return isl_printer_free(p);
 567         if (!strcmp(name, "init_device"))
 568                 return init_device(p, prog);
 569         if (!strcmp(name, "clear_device"))
 570                 return clear_device(p, prog);
 571         if (!array)
 572                 return isl_printer_free(p);
 573
 574         if (!prefixcmp(name, "to_device"))
 575                 return copy_array_to_device(p, array);
 576         else
 577                 return copy_array_from_device(p, array);
 578 }
 579
 580 struct print_host_user_data {
 581         struct cuda_info *cuda;
 582         struct gpu_prog *prog;
 583 };
 584
 585 /* Print the user statement of the host code to "p".
 586  *
 587  * The host code may contain original user statements, kernel launches,
 588  * statements that copy data to/from the device and statements
 589  * the initialize or clear the device.
 590  * The original user statements and the kernel launches have
 591  * an associated annotation, while the other statements do not.
 592  * The latter are handled by print_device_node.
 593  * The annotation on the user statements is called "user".
 594  *
 595  * In case of a kernel launch, print a block of statements that
 596  * defines the grid and the block and then launches the kernel.
 597  */
 598 static __isl_give isl_printer *print_host_user(__isl_take isl_printer *p,
 599         __isl_take isl_ast_print_options *print_options,
 600         __isl_keep isl_ast_node *node, void *user)
 601 {
 602         isl_id *id;
 603         int is_user;
 604         struct ppcg_kernel *kernel;
 605         struct ppcg_kernel_stmt *stmt;
 606         struct print_host_user_data *data;
 607
 608         isl_ast_print_options_free(print_options);
 609
 610         data = (struct print_host_user_data *) user;
 611
 612         id = isl_ast_node_get_annotation(node);
 613         if (!id)
 614                 return print_device_node(p, node, data->prog);
 615
 616         is_user = !strcmp(isl_id_get_name(id), "user");
 617         kernel = is_user ? NULL : isl_id_get_user(id);
 618         stmt = is_user ? isl_id_get_user(id) : NULL;
 619         isl_id_free(id);
 620
 621         if (is_user)
 622                 return ppcg_kernel_print_domain(p, stmt);
 623
 624         p = ppcg_start_block(p);
 625
 626         p = isl_printer_start_line(p);
 627         p = isl_printer_print_str(p, "dim3 k");
 628         p = isl_printer_print_int(p, kernel->id);
 629         p = isl_printer_print_str(p, "_dimBlock");
 630         print_reverse_list(isl_printer_get_file(p),
 631                                 kernel->n_block, kernel->block_dim);
 632         p = isl_printer_print_str(p, ";");
 633         p = isl_printer_end_line(p);
 634
 635         p = print_grid(p, kernel);
 636
 637         p = isl_printer_start_line(p);
 638         p = isl_printer_print_str(p, "kernel");
 639         p = isl_printer_print_int(p, kernel->id);
 640         p = isl_printer_print_str(p, " <<<k");
 641         p = isl_printer_print_int(p, kernel->id);
 642         p = isl_printer_print_str(p, "_dimGrid, k");
 643         p = isl_printer_print_int(p, kernel->id);
 644         p = isl_printer_print_str(p, "_dimBlock>>> (");
 645         p = print_kernel_arguments(p, data->prog, kernel, 0);
 646         p = isl_printer_print_str(p, ");");
 647         p = isl_printer_end_line(p);
 648
 649         p = isl_printer_start_line(p);
 650         p = isl_printer_print_str(p, "cudaCheckKernel();");
 651         p = isl_printer_end_line(p);
 652
 653         p = ppcg_end_block(p);
 654
 655         p = isl_printer_start_line(p);
 656         p = isl_printer_end_line(p);
 657
 658         print_kernel(data->prog, kernel, data->cuda);
 659
 660         return p;
 661 }
 662
 663 static __isl_give isl_printer *print_host_code(__isl_take isl_printer *p,
 664         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 665         struct cuda_info *cuda)
 666 {
 667         isl_ast_print_options *print_options;
 668         isl_ctx *ctx = isl_ast_node_get_ctx(tree);
 669         struct print_host_user_data data = { cuda, prog };
 670
 671         print_options = isl_ast_print_options_alloc(ctx);
 672         print_options = isl_ast_print_options_set_print_user(print_options,
 673                                                 &print_host_user, &data);
 674
 675         p = gpu_print_macros(p, tree);
 676         p = isl_ast_node_print(tree, p, print_options);
 677
 678         return p;
 679 }
 680
 681 /* Given a gpu_prog "prog" and the corresponding transformed AST
 682  * "tree", print the entire CUDA code to "p".
 683  * "types" collects the types for which a definition has already
 684  * been printed.
 685  */
 686 static __isl_give isl_printer *print_cuda(__isl_take isl_printer *p,
 687         struct gpu_prog *prog, __isl_keep isl_ast_node *tree,
 688         struct gpu_types *types, void *user)
 689 {
 690         struct cuda_info *cuda = user;
 691         isl_printer *kernel;
 692
 693         kernel = isl_printer_to_file(isl_printer_get_ctx(p), cuda->kernel_c);
 694         kernel = isl_printer_set_output_format(kernel, ISL_FORMAT_C);
 695         kernel = gpu_print_types(kernel, types, prog);
 696         isl_printer_free(kernel);
 697
 698         if (!kernel)
 699                 return isl_printer_free(p);
 700
 701         p = print_host_code(p, prog, tree, cuda);
 702
 703         return p;
 704 }
 705
 706 /* Transform the code in the file called "input" by replacing
 707  * all scops by corresponding CUDA code.
 708  * The names of the output files are derived from "input".
 709  *
 710  * We let generate_gpu do all the hard work and then let it call
 711  * us back for printing the AST in print_cuda.
 712  *
 713  * To prepare for this printing, we first open the output files
 714  * and we close them after generate_gpu has finished.
 715  */
 716 int generate_cuda(isl_ctx *ctx, struct ppcg_options *options,
 717         const char *input)
 718 {
 719         struct cuda_info cuda;
 720         int r;
 721
 722         cuda_open_files(&cuda, input);
 723
 724         r = generate_gpu(ctx, input, cuda.host_c, options, &print_cuda, &cuda);
 725
 726         cuda_close_files(&cuda);
 727
 728         return r;
 729 }