gpu_group.c

   1 /*
   2  * Copyright 2010-2011 INRIA Saclay
   3  * Copyright 2012-2014 Ecole Normale Superieure
   4  * Copyright 2015      Sven Verdoolaege
   5  *
   6  * Use of this software is governed by the MIT license
   7  *
   8  * Written by Sven Verdoolaege, INRIA Saclay - Ile-de-France,
   9  * Parc Club Orsay Universite, ZAC des vignes, 4 rue Jacques Monod,
  10  * 91893 Orsay, France
  11  * and Ecole Normale Superieure, 45 rue d'Ulm, 75230 Paris, France
  12  */
  13
  14 #include <isl/aff.h>
  15 #include <isl/map.h>
  16 #include <isl/constraint.h>
  17
  18 #include "gpu_array_tile.h"
  19 #include "gpu_group.h"
  20 #include "gpu_tree.h"
  21 #include "schedule.h"
  22
  23 /* Print the name of the local copy of a given group of array references.
  24  */
  25 __isl_give isl_printer *gpu_array_ref_group_print_name(
  26         struct gpu_array_ref_group *group, __isl_take isl_printer *p)
  27 {
  28         int global = 0;
  29         enum ppcg_group_access_type type;
  30
  31         type = gpu_array_ref_group_type(group);
  32         if (type == ppcg_access_private)
  33                 p = isl_printer_print_str(p, "private_");
  34         else if (type == ppcg_access_shared)
  35                 p = isl_printer_print_str(p, "shared_");
  36         else
  37                 global = 1;
  38         p = isl_printer_print_str(p, group->array->name);
  39         if (!global && group->local_array->n_group > 1) {
  40                 p = isl_printer_print_str(p, "_");
  41                 p = isl_printer_print_int(p, group->nr);
  42         }
  43
  44         return p;
  45 }
  46
  47 /* Return the union of all read (read = 1) and/or write (write = 1)
  48  * access relations in the group.
  49  */
  50 __isl_give isl_union_map *gpu_array_ref_group_access_relation(
  51         struct gpu_array_ref_group *group, int read, int write)
  52 {
  53         int i;
  54         isl_union_map *access;
  55
  56         access = isl_union_map_empty(isl_map_get_space(group->access));
  57         for (i = 0; i < group->n_ref; ++i) {
  58                 isl_map *map_i;
  59
  60                 if (!((read && group->refs[i]->read) ||
  61                      (write && group->refs[i]->write)))
  62                         continue;
  63                 map_i = isl_map_copy(group->refs[i]->access);
  64                 access = isl_union_map_union(access,
  65                                             isl_union_map_from_map(map_i));
  66         }
  67
  68         return access;
  69 }
  70
  71 /* Should this array reference group be mapped to private, shared or global
  72  * memory?
  73  * If we have computed both a private and a shared tile, then
  74  * the tile with the smallest depth is used.  If both have the same depth,
  75  * then the private tile is used.
  76  */
  77 enum ppcg_group_access_type gpu_array_ref_group_type(
  78         struct gpu_array_ref_group *group)
  79 {
  80         if (group->private_tile && group->shared_tile &&
  81             group->shared_tile->depth < group->private_tile->depth)
  82                 return ppcg_access_shared;
  83         if (group->private_tile)
  84                 return ppcg_access_private;
  85         if (group->shared_tile)
  86                 return ppcg_access_shared;
  87         return ppcg_access_global;
  88 }
  89
  90
  91 /* Return the effective gpu_array_tile associated to "group" or
  92  * NULL if there is no such gpu_array_tile.
  93  */
  94 struct gpu_array_tile *gpu_array_ref_group_tile(
  95         struct gpu_array_ref_group *group)
  96 {
  97         switch (gpu_array_ref_group_type(group)) {
  98         case ppcg_access_global:
  99                 return NULL;
 100         case ppcg_access_shared:
 101                 return group->shared_tile;
 102         case ppcg_access_private:
 103                 return group->private_tile;
 104         }
 105 }
 106
 107 /* Does the tile associated to "group" require unrolling of the schedule
 108  * dimensions mapped to threads?
 109  * Note that this can only happen for private tiles.
 110  */
 111 int gpu_array_ref_group_requires_unroll(struct gpu_array_ref_group *group)
 112 {
 113         struct gpu_array_tile *tile;
 114
 115         tile = gpu_array_ref_group_tile(group);
 116         if (!tile)
 117                 return 0;
 118         return tile->requires_unroll;
 119 }
 120
 121 /* Given an array access "access", check if for any index i there is
 122  * a shift a(p) and a stride g such that
 123  *
 124  *      a(p) + i = 0 mod g
 125  *
 126  * If so, record the information in tile->bound[i]->stride and
 127  * tile->bound[i]->shift.
 128  * Otherwise, set tile->bound[i]->stride to 1 (and tile->bound[i]->shift to 0).
 129  * Return isl_bool_true if any non-trivial stride was found.
 130  *
 131  * Note that the stride info returned by isl_map_get_range_stride_info
 132  * is of the form
 133  *
 134  *      i = o(p) + g n
 135  *
 136  * a(p) can therefore be taken to be equal to -o(p).
 137  */
 138 static isl_bool detect_strides(struct gpu_array_tile *tile,
 139         __isl_keep isl_map *access)
 140 {
 141         int i;
 142         isl_bool has_strides = isl_bool_false;
 143
 144         for (i = 0; i < tile->n; ++i) {
 145                 struct gpu_array_bound *bound = &tile->bound[i];
 146                 isl_stride_info *si;
 147
 148                 si = isl_map_get_range_stride_info(access, i);
 149                 bound->stride = isl_stride_info_get_stride(si);
 150                 bound->shift = isl_aff_neg(isl_stride_info_get_offset(si));
 151                 isl_stride_info_free(si);
 152
 153                 if (!has_strides)
 154                         has_strides = isl_val_gt_si(bound->stride, 1);
 155                 if (has_strides < 0)
 156                         return isl_bool_error;
 157         }
 158
 159         return has_strides;
 160 }
 161
 162 /* Given an array access "access", remove the strides based
 163  * on the information in tile->bound[i]->stride and tile->bound[i]->shift.
 164  *
 165  * In particular let the access be A[a] and
 166  * let the shifts s_i(p) and the strides g_i be such that
 167  *
 168  *  S(p) + a = 0 mod G
 169  *
 170  * Replace the access by
 171  *
 172  *  A[(a + S(p))/G]
 173  *
 174  * First collect the shifts s_i into an isl_multi_aff and
 175  * the strides into the scaling function A[i] -> A[G i].
 176  * Then add the shifts to the original access and
 177  * take the preimage over the scaling.
 178  */
 179 static __isl_give isl_map *remove_strides(__isl_take isl_map *access,
 180         struct gpu_array_tile *tile)
 181 {
 182         int i;
 183         isl_space *space;
 184         isl_multi_aff *shift, *scale;
 185         isl_multi_val *stride;
 186
 187         space = isl_map_get_space(access);
 188         shift = isl_multi_aff_zero(isl_space_copy(space));
 189         space = isl_space_range(space);
 190         stride = isl_multi_val_zero(isl_space_copy(space));
 191         scale = isl_multi_aff_identity(isl_space_map_from_set(space));
 192         for (i = 0; i < tile->n; ++i) {
 193                 struct gpu_array_bound *bound = &tile->bound[i];
 194                 isl_aff *shift_i;
 195                 isl_val *stride_i;
 196
 197                 shift_i = isl_aff_copy(bound->shift);
 198                 stride_i = isl_val_copy(bound->stride);
 199                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
 200                 stride = isl_multi_val_set_val(stride, i, stride_i);
 201         }
 202         scale = isl_multi_aff_scale_multi_val(scale, stride);
 203
 204         access = isl_map_sum(access, isl_map_from_multi_aff(shift));
 205         access = isl_map_preimage_range_multi_aff(access, scale);
 206
 207         return access;
 208 }
 209
 210 /* Check if we can find a memory tile for the given array
 211  * based on the given accesses, and if so, put the results in "tile".
 212  *
 213  * We project the accesses on each index in turn and look for a parametric
 214  * offset such that the size is constant, after removing
 215  * any stride that may appear in the accesses.
 216  *
 217  * tile->depth is initialized to the input dimension of the computed bounds.
 218  */
 219 static isl_bool can_tile(__isl_keep isl_map *access,
 220         struct gpu_array_tile *tile)
 221 {
 222         int i;
 223         isl_bool has_strides, valid;
 224         isl_fixed_box *box;
 225         isl_multi_aff *offset;
 226         isl_multi_val *size;
 227
 228         if (!tile)
 229                 return isl_bool_error;
 230
 231         isl_map_free(isl_map_detect_equalities(isl_map_copy(access)));
 232
 233         has_strides = detect_strides(tile, access);
 234         if (has_strides < 0)
 235                 return isl_bool_error;
 236
 237         tile->depth = isl_map_dim(access, isl_dim_in);
 238
 239         access = isl_map_copy(access);
 240         if (has_strides)
 241                 access = remove_strides(access, tile);
 242
 243         box = isl_map_get_range_simple_fixed_box_hull(access);
 244         isl_map_free(access);
 245
 246         valid = isl_fixed_box_is_valid(box);
 247         if (valid >= 0 && valid) {
 248                 offset = isl_fixed_box_get_offset(box);
 249                 size = isl_fixed_box_get_size(box);
 250                 for (i = 0; i < tile->n; ++i) {
 251                         tile->bound[i].size = isl_multi_val_get_val(size, i);
 252                         tile->bound[i].lb = isl_multi_aff_get_aff(offset, i);
 253                 }
 254                 isl_multi_aff_free(offset);
 255                 isl_multi_val_free(size);
 256         }
 257         isl_fixed_box_free(box);
 258
 259         return valid;
 260 }
 261
 262 /* Internal data structure for gpu_group_references.
 263  *
 264  * scop represents the input scop.
 265  * kernel_depth is the schedule depth where the kernel launch will
 266  * be introduced, i.e., it is the depth of the band that is mapped
 267  * to blocks.
 268  * shared_depth is the schedule depth at which the copying to/from
 269  * shared memory is computed.  The copy operation may then
 270  * later be hoisted to a higher level.
 271  * thread_depth is the schedule depth where the thread mark is located,
 272  * i.e., it is the depth of the band that is mapped to threads and also
 273  * the schedule depth at which the copying to/from private memory
 274  * is computed.  The copy operation may then later be hoisted to
 275  * a higher level.
 276  * n_thread is the number of schedule dimensions in the band that
 277  * is mapped to threads.
 278  * privatization lives in the range of thread_sched (i.e., it is
 279  * of dimension thread_depth + n_thread) and encodes the mapping
 280  * to thread identifiers (as parameters).
 281  * host_sched contains the kernel_depth dimensions of the host schedule.
 282  * shared_sched contains the first shared_depth dimensions of the
 283  * kernel schedule.
 284  * copy_sched contains the first thread_depth dimensions of the
 285  * kernel schedule.
 286  * thread_sched contains the first (thread_depth + n_thread) dimensions
 287  * of the kernel schedule.
 288  * full_sched is a union_map representation of the entire kernel schedule.
 289  * The schedules are all formulated in terms of the original statement
 290  * instances, i.e., those that appear in the domains of the access
 291  * relations.
 292  */
 293 struct gpu_group_data {
 294         struct ppcg_scop *scop;
 295         int kernel_depth;
 296         int shared_depth;
 297         int thread_depth;
 298         int n_thread;
 299         isl_set *privatization;
 300         isl_union_map *host_sched;
 301         isl_union_map *shared_sched;
 302         isl_union_map *copy_sched;
 303         isl_union_map *thread_sched;
 304         isl_union_map *full_sched;
 305 };
 306
 307 /* Construct a map from domain_space to domain_space that increments
 308  * the dimension at position "pos" and leaves all other dimensions
 309  * constant.
 310  */
 311 static __isl_give isl_map *next(__isl_take isl_space *domain_space, int pos)
 312 {
 313         isl_space *space;
 314         isl_aff *aff;
 315         isl_multi_aff *next;
 316
 317         space = isl_space_map_from_set(domain_space);
 318         next = isl_multi_aff_identity(space);
 319         aff = isl_multi_aff_get_aff(next, pos);
 320         aff = isl_aff_add_constant_si(aff, 1);
 321         next = isl_multi_aff_set_aff(next, pos, aff);
 322
 323         return isl_map_from_multi_aff(next);
 324 }
 325
 326 /* Check if the given access is coalesced (or if there is no point
 327  * in trying to coalesce the access by mapping the array to shared memory).
 328  * That is, check whether incrementing the dimension that will get
 329  * wrapped over the last thread index results in incrementing
 330  * the last array index.
 331  *
 332  * If no two consecutive array elements are ever accessed by "access",
 333  * then mapping the corresponding array to shared memory will not
 334  * improve coalescing.  In fact, the copying will likely be performed
 335  * by a single thread.  Consider the access as coalesced such that
 336  * the caller will not try and map the array to shared memory just
 337  * to improve coalescing.
 338  *
 339  * This function is only called for access relations without reuse and
 340  * kernels with at least one thread identifier.
 341  */
 342 static int access_is_coalesced(struct gpu_group_data *data,
 343         __isl_keep isl_union_map *access)
 344 {
 345         int dim;
 346         isl_space *space;
 347         isl_set *accessed;
 348         isl_map *access_map;
 349         isl_map *next_thread_x;
 350         isl_map *next_element;
 351         isl_map *map;
 352         int coalesced, empty;
 353
 354         access = isl_union_map_copy(access);
 355         access = isl_union_map_apply_domain(access,
 356                                 isl_union_map_copy(data->full_sched));
 357         access_map = isl_map_from_union_map(access);
 358
 359         space = isl_map_get_space(access_map);
 360         space = isl_space_range(space);
 361         dim = isl_space_dim(space, isl_dim_set);
 362         if (dim == 0)
 363                 next_element = isl_map_empty(isl_space_map_from_set(space));
 364         else
 365                 next_element = next(space, dim - 1);
 366
 367         accessed = isl_map_range(isl_map_copy(access_map));
 368         map = isl_map_copy(next_element);
 369         map = isl_map_intersect_domain(map, isl_set_copy(accessed));
 370         map = isl_map_intersect_range(map, accessed);
 371         empty = isl_map_is_empty(map);
 372         isl_map_free(map);
 373
 374         if (empty < 0 || empty) {
 375                 isl_map_free(next_element);
 376                 isl_map_free(access_map);
 377                 return empty;
 378         }
 379
 380         space = isl_map_get_space(access_map);
 381         space = isl_space_domain(space);
 382         next_thread_x = next(space, data->thread_depth + data->n_thread - 1);
 383
 384         map = isl_map_apply_domain(next_thread_x, isl_map_copy(access_map));
 385         map = isl_map_apply_range(map, access_map);
 386
 387         coalesced = isl_map_is_subset(map, next_element);
 388
 389         isl_map_free(next_element);
 390         isl_map_free(map);
 391
 392         return coalesced;
 393 }
 394
 395 /* Replace the host schedule dimensions in the access relation "access"
 396  * by parameters, so that they are treated as fixed when checking for reuse
 397  * (within a kernel) or whether two consecutive elements are accessed
 398  * (within a kernel).
 399  */
 400 static __isl_give isl_union_map *localize_access(struct gpu_group_data *data,
 401         __isl_take isl_union_map *access)
 402 {
 403         int n;
 404         isl_space *space;
 405         isl_set *param;
 406         isl_union_map *umap;
 407         isl_id_list *ids;
 408
 409         umap = isl_union_map_copy(data->host_sched);
 410         space = isl_union_map_get_space(umap);
 411         n = data->kernel_depth;
 412         ids = ppcg_scop_generate_names(data->scop, n, "__ppcg_host_");
 413         param = parametrization(space, n, 0, ids);
 414         isl_id_list_free(ids);
 415         umap = isl_union_map_intersect_range(umap,
 416                                                 isl_union_set_from_set(param));
 417         access = isl_union_map_intersect_domain(access,
 418                                                 isl_union_map_domain(umap));
 419
 420         return access;
 421 }
 422
 423 /* Given an access relation in terms of at least data->thread_depth initial
 424  * dimensions of the computed schedule, check if it is bijective for
 425  * fixed values of the first data->thread_depth dimensions.
 426  * We perform this check by equating these dimensions to parameters.
 427  */
 428 static int access_is_bijective(struct gpu_group_data *data,
 429         __isl_keep isl_map *access)
 430 {
 431         int res;
 432         int dim;
 433         isl_set *par;
 434         isl_space *space;
 435         isl_id_list *ids;
 436
 437         access = isl_map_copy(access);
 438         space = isl_space_params(isl_map_get_space(access));
 439         ids = ppcg_scop_generate_names(data->scop, data->thread_depth, "s");
 440         dim = isl_map_dim(access, isl_dim_in);
 441         par = parametrization(space, dim, 0, ids);
 442         isl_id_list_free(ids);
 443         access = isl_map_intersect_domain(access, par);
 444         res = isl_map_is_bijective(access);
 445         isl_map_free(access);
 446
 447         return res;
 448 }
 449
 450 /* Compute the number of outer schedule tile dimensions that affect
 451  * the offset of "tile".
 452  * If there is no such dimension, then return the index
 453  * of the first kernel dimension, i.e., data->kernel_depth.
 454  */
 455 static int compute_tile_depth(struct gpu_group_data *data,
 456         struct gpu_array_tile *tile)
 457 {
 458         int i, j;
 459
 460         for (j = tile->depth - 1; j >= data->kernel_depth; --j) {
 461                 for (i = 0; i < tile->n; ++i) {
 462                         isl_aff *lb;
 463                         isl_aff *shift;
 464
 465                         lb = tile->bound[i].lb;
 466                         if (isl_aff_involves_dims(lb, isl_dim_in, j, 1))
 467                                 break;
 468
 469                         shift = tile->bound[i].shift;
 470                         if (!shift)
 471                                 continue;
 472                         if (isl_aff_involves_dims(shift, isl_dim_in, j, 1))
 473                                 break;
 474                 }
 475                 if (i < tile->n)
 476                         break;
 477         }
 478
 479         return ++j;
 480 }
 481
 482 /* Return the lowest depth between data->kernel_depth and data->thread_depth
 483  * at which every array element accessed through "acc" is accessed
 484  * by a single thread.  The input dimension of "acc" is
 485  * data->thread_depth + data->n_thread, where the final data->n_thread
 486  * dimensions are those that will be mapped to threads.
 487  * If the values for these dimensions are uniquely determined
 488  * by the array index and a given number of outer dimensions, then
 489  * there is only one thread accessing that array element within those
 490  * outer dimensions.
 491  *
 492  * The input space of "acc" is first split up, such that it has the form
 493  *
 494  *      [O -> T] -> A
 495  *
 496  * with O the outer dimensions, T the dimensions that will be mapped to threads
 497  * and A the array index.
 498  *
 499  * Then the positions of T and A are interchanged to simplify the test
 500  * whether T uniquely depends on O and A.
 501  * In particular, the above access relation is first combined with
 502  *
 503  *      [O -> T] -> T
 504  *
 505  * to form
 506  *
 507  *      [O -> T] -> [A -> T]
 508  *
 509  * from which
 510  *
 511  *      O -> [A -> T]
 512  *
 513  * is extracted, which is then uncurried to
 514  *
 515  *      [O -> A] -> T
 516  *
 517  * Finally, the final dimensions of O are projected out one by one
 518  * until T is no longer uniquely determined by A and the remaining
 519  * dimensions in O.  The value returned is that of the last dimension
 520  * that was successfully projected out.
 521  * Note that there is no need to test whether [O -> A] -> T itself
 522  * is single-valued as that was already tested in access_is_bijective.
 523  */
 524 static int compute_accessed_by_single_thread_depth(struct gpu_group_data *data,
 525         __isl_keep isl_map *acc)
 526 {
 527         int i;
 528         isl_space *space;
 529         isl_map *map;
 530         isl_bool sv;
 531
 532         if (data->thread_depth == data->kernel_depth)
 533                 return data->thread_depth;
 534
 535         acc = isl_map_copy(acc);
 536
 537         space = isl_map_get_space(acc);
 538         space = isl_space_params(space);
 539         space = isl_space_set_from_params(space);
 540         space = isl_space_add_dims(space, isl_dim_set, data->thread_depth);
 541         space = isl_space_from_domain(space);
 542         space = isl_space_add_dims(space, isl_dim_out, data->n_thread);
 543         space = isl_space_wrap(space);
 544         map = isl_set_flatten_map(isl_set_universe(space));
 545         acc = isl_map_apply_range(map, acc);
 546
 547         space = isl_space_domain(isl_map_get_space(acc));
 548         map = isl_map_range_map(isl_map_universe(isl_space_unwrap(space)));
 549         acc = isl_map_range_product(acc, map);
 550         acc = isl_map_domain_factor_domain(acc);
 551         acc = isl_map_uncurry(acc);
 552
 553         for (i = data->thread_depth - 1; i >= data->kernel_depth; --i) {
 554                 acc = isl_map_project_out(acc, isl_dim_in, i, 1);
 555                 sv = isl_map_is_single_valued(acc);
 556                 if (sv < 0)
 557                         goto error;
 558                 if (!sv)
 559                         break;
 560         }
 561
 562         isl_map_free(acc);
 563
 564         return ++i;
 565 error:
 566         isl_map_free(acc);
 567         return -1;
 568 }
 569
 570 /* Adjust the fields of "tile" to reflect the new input dimension "depth".
 571  * The dimension beyond "depth" are assumed not to affect the tile,
 572  * so they can simply be dropped.
 573  */
 574 static int tile_adjust_depth(struct gpu_array_tile *tile, int depth)
 575 {
 576         int i;
 577
 578         if (tile->depth == depth)
 579                 return 0;
 580
 581         for (i = 0; i < tile->n; ++i) {
 582                 tile->bound[i].lb = isl_aff_drop_dims(tile->bound[i].lb,
 583                                         isl_dim_in, depth, tile->depth - depth);
 584                 if (!tile->bound[i].lb)
 585                         return -1;
 586                 if (!tile->bound[i].shift)
 587                         continue;
 588                 tile->bound[i].shift = isl_aff_drop_dims(tile->bound[i].shift,
 589                                         isl_dim_in, depth, tile->depth - depth);
 590                 if (!tile->bound[i].shift)
 591                         return -1;
 592         }
 593
 594         tile->depth = depth;
 595
 596         return 0;
 597 }
 598
 599 /* Determine the number of schedule dimensions that affect the offset of the
 600  * shared or private tile "tile" and store the result in tile->depth, with
 601  * a lower bound of data->kernel_depth.
 602  * Also adjust the fields of the tile to only refer to the tile->depth
 603  * outer schedule dimensions.
 604  */
 605 static isl_stat tile_set_depth(struct gpu_group_data *data,
 606         struct gpu_array_tile *tile)
 607 {
 608         if (tile_adjust_depth(tile, compute_tile_depth(data, tile)) < 0)
 609                 return isl_stat_error;
 610
 611         return isl_stat_ok;
 612 }
 613
 614 /* Determine the number of schedule dimensions that affect the offset of the
 615  * shared tile and store the minimum of the private and shared tile depth
 616  * in group->min_depth, with a lower bound of data->kernel_depth.
 617  * If there is no tile defined on the array reference group,
 618  * then set group->min_depth to data->thread_depth.
 619  */
 620 static int set_depth(struct gpu_group_data *data,
 621         struct gpu_array_ref_group *group)
 622 {
 623         group->min_depth = data->thread_depth;
 624
 625         if (group->private_tile) {
 626                 if (group->private_tile->depth < group->min_depth)
 627                         group->min_depth = group->private_tile->depth;
 628         }
 629         if (group->shared_tile) {
 630                 if (tile_set_depth(data, group->shared_tile) < 0)
 631                         return -1;
 632                 if (group->shared_tile->depth < group->min_depth)
 633                         group->min_depth = group->shared_tile->depth;
 634         }
 635
 636         return 0;
 637 }
 638
 639 /* Fill up the groups array with singleton groups, i.e., one group
 640  * per reference, initializing the array, access, write, n_ref and refs fields.
 641  * In particular the access field is initialized to the scheduled
 642  * access relation of the array reference.
 643  *
 644  * Return the number of elements initialized, i.e., the number of
 645  * active references in the current kernel.
 646  */
 647 static int populate_array_references(struct gpu_local_array_info *local,
 648         struct gpu_array_ref_group **groups, struct gpu_group_data *data)
 649 {
 650         int i;
 651         int n;
 652         isl_ctx *ctx = isl_union_map_get_ctx(data->copy_sched);
 653
 654         n = 0;
 655         for (i = 0; i < local->array->n_ref; ++i) {
 656                 isl_union_map *umap;
 657                 isl_map *map;
 658                 struct gpu_array_ref_group *group;
 659                 struct gpu_stmt_access *access = local->array->refs[i];
 660
 661                 map = isl_map_copy(access->access);
 662                 umap = isl_union_map_from_map(map);
 663                 umap = isl_union_map_apply_domain(umap,
 664                                 isl_union_map_copy(data->copy_sched));
 665
 666                 if (isl_union_map_is_empty(umap)) {
 667                         isl_union_map_free(umap);
 668                         continue;
 669                 }
 670
 671                 map = isl_map_from_union_map(umap);
 672                 map = isl_map_detect_equalities(map);
 673
 674                 group = isl_calloc_type(ctx, struct gpu_array_ref_group);
 675                 if (!group) {
 676                         isl_map_free(map);
 677                         return -1;
 678                 }
 679                 group->local_array = local;
 680                 group->array = local->array;
 681                 group->access = map;
 682                 group->write = access->write;
 683                 group->exact_write = access->exact_write;
 684                 group->slice = access->n_index < local->array->n_index;
 685                 group->refs = &local->array->refs[i];
 686                 group->n_ref = 1;
 687
 688                 groups[n++] = group;
 689         }
 690
 691         return n;
 692 }
 693
 694 /* If group->n_ref == 1, then group->refs was set by
 695  * populate_array_references to point directly into
 696  * group->array->refs and should not be freed.
 697  * If group->n_ref > 1, then group->refs was set by join_groups
 698  * to point to a newly allocated array.
 699  */
 700 struct gpu_array_ref_group *gpu_array_ref_group_free(
 701         struct gpu_array_ref_group *group)
 702 {
 703         if (!group)
 704                 return NULL;
 705         gpu_array_tile_free(group->shared_tile);
 706         gpu_array_tile_free(group->private_tile);
 707         isl_map_free(group->access);
 708         if (group->n_ref > 1)
 709                 free(group->refs);
 710         free(group);
 711         return NULL;
 712 }
 713
 714 /* Check if the access relations of group1 and group2 overlap within
 715  * copy_sched.
 716  */
 717 static int accesses_overlap(struct gpu_array_ref_group *group1,
 718         struct gpu_array_ref_group *group2)
 719 {
 720         int disjoint;
 721
 722         disjoint = isl_map_is_disjoint(group1->access, group2->access);
 723         if (disjoint < 0)
 724                 return -1;
 725
 726         return !disjoint;
 727 }
 728
 729 /* Combine the given two groups into a single group, containing
 730  * the references of both groups.
 731  */
 732 static struct gpu_array_ref_group *join_groups(
 733         struct gpu_array_ref_group *group1,
 734         struct gpu_array_ref_group *group2)
 735 {
 736         int i;
 737         isl_ctx *ctx;
 738         struct gpu_array_ref_group *group;
 739
 740         if (!group1 || !group2)
 741                 return NULL;
 742
 743         ctx = isl_map_get_ctx(group1->access);
 744         group = isl_calloc_type(ctx, struct gpu_array_ref_group);
 745         if (!group)
 746                 return NULL;
 747         group->local_array = group1->local_array;
 748         group->array = group1->array;
 749         group->access = isl_map_union(isl_map_copy(group1->access),
 750                                         isl_map_copy(group2->access));
 751         group->write = group1->write || group2->write;
 752         group->exact_write = group1->exact_write && group2->exact_write;
 753         group->slice = group1->slice || group2->slice;
 754         group->n_ref = group1->n_ref + group2->n_ref;
 755         group->refs = isl_alloc_array(ctx, struct gpu_stmt_access *,
 756                                         group->n_ref);
 757         if (!group->refs)
 758                 return gpu_array_ref_group_free(group);
 759         for (i = 0; i < group1->n_ref; ++i)
 760                 group->refs[i] = group1->refs[i];
 761         for (i = 0; i < group2->n_ref; ++i)
 762                 group->refs[group1->n_ref + i] = group2->refs[i];
 763
 764         return group;
 765 }
 766
 767 /* Combine the given two groups into a single group and free
 768  * the original two groups.
 769  */
 770 static struct gpu_array_ref_group *join_groups_and_free(
 771         struct gpu_array_ref_group *group1,
 772         struct gpu_array_ref_group *group2)
 773 {
 774         struct gpu_array_ref_group *group;
 775
 776         group = join_groups(group1, group2);
 777         gpu_array_ref_group_free(group1);
 778         gpu_array_ref_group_free(group2);
 779         return group;
 780 }
 781
 782 /* Report that the array reference group with the given access relation
 783  * is not mapped to shared memory in the given kernel because
 784  * it does not exhibit any reuse and is considered to be coalesced.
 785  */
 786 static void report_no_reuse_and_coalesced(struct ppcg_kernel *kernel,
 787         __isl_keep isl_union_map *access)
 788 {
 789         isl_ctx *ctx;
 790         isl_printer *p;
 791
 792         ctx = isl_union_map_get_ctx(access);
 793         p = isl_printer_to_file(ctx, stdout);
 794         p = isl_printer_print_str(p, "Array reference group ");
 795         p = isl_printer_print_union_map(p, access);
 796         p = isl_printer_print_str(p,
 797             " not considered for mapping to shared memory in kernel");
 798         p = isl_printer_print_int(p, kernel->id);
 799         p = isl_printer_print_str(p,
 800             " because it exhibits no reuse and is considered to be coalesced");
 801         p = isl_printer_end_line(p);
 802         isl_printer_free(p);
 803 }
 804
 805 /* Given an access relation in terms of the data->thread_depth initial
 806  * dimensions of the computed schedule and the thread identifiers
 807  * (as parameters), check if the use of the corresponding private tile
 808  * requires unrolling.
 809  *
 810  * If we are creating a private tile because we are forced to,
 811  * then no unrolling is required.
 812  * Otherwise we check if "access" is bijective and unrolling
 813  * is required if it is not.  Note that the access relation
 814  * has already been determined to be bijective before the introduction
 815  * of the thread identifiers and the removal of the schedule dimensions
 816  * that are mapped to these threads.  If the access relation is no longer
 817  * bijective, then this means that more than one value of one of those
 818  * schedule dimensions is mapped to the same thread and therefore
 819  * unrolling is required.
 820  */
 821 static int check_requires_unroll(struct gpu_group_data *data,
 822         __isl_keep isl_map *access, int force_private)
 823 {
 824         int bijective;
 825
 826         if (force_private)
 827                 return 0;
 828         bijective = access_is_bijective(data, access);
 829         if (bijective < 0)
 830                 return -1;
 831         return !bijective;
 832 }
 833
 834 /* Map the domain of "access" to the outer data->shared_depth
 835  * schedule dimensions.  When data->shared_depth is equal to
 836  * data->thread_depth, this result is already available in group->access.
 837  */
 838 static __isl_give isl_map *shared_access(struct gpu_array_ref_group *group,
 839         __isl_keep isl_union_map *access, struct gpu_group_data *data)
 840 {
 841         isl_union_map *shared;
 842
 843         if (data->shared_depth == data->thread_depth)
 844                 return isl_map_copy(group->access);
 845
 846         shared = isl_union_map_copy(access);
 847         shared = isl_union_map_apply_domain(shared,
 848                         isl_union_map_copy(data->shared_sched));
 849         return isl_map_from_union_map(shared);
 850 }
 851
 852 /* Compute the private and/or shared memory tiles for the array
 853  * reference group "group" of array "array".
 854  * Return isl_stat_ok on success and isl_stat_error on error.
 855  *
 856  * If the array is a read-only scalar or if the user requested
 857  * not to use shared or private memory, then we do not need to do anything.
 858  *
 859  * If any reference in the reference group accesses more than one element,
 860  * then we would have to make sure that the layout in shared memory
 861  * is the same as that in global memory.  Since we do not handle this yet
 862  * (and it may not even be possible), we refuse to map to private or
 863  * shared memory in such cases.
 864  *
 865  * If the array group involves any may writes (that are not must writes),
 866  * then we would have to make sure that we load the data into shared/private
 867  * memory first in case the data is not written by the kernel
 868  * (but still written back out to global memory).
 869  * Since we don't have any such mechanism at the moment, we don't
 870  * compute shared/private tiles for groups involving may writes.
 871  *
 872  * We only try to compute a shared memory tile if there is any reuse
 873  * or if the access is not coalesced.
 874  * Reuse and coalescing are checked within the given kernel.
 875  *
 876  * For computing a private memory tile, we also require that there is
 877  * some reuse.  Moreover, we require that the access is private
 878  * to the thread.  That is, we check that any given array element
 879  * is only accessed by a single thread.
 880  * We compute an access relation that maps the outer
 881  * data->thread_depth + data->n_thread schedule dimensions.
 882  * The latter data->n_thread will be mapped to thread identifiers.
 883  * We actually check that those iterators that will be wrapped
 884  * partition the array space.  This check is stricter than necessary
 885  * since several iterations may be mapped onto the same thread
 886  * and then they could be allowed to access the same memory elements,
 887  * but our check does not allow this situation.
 888  *
 889  * For private memory tiles, the number of schedule dimensions that
 890  * affect the offset is computed and stored in tile->depth, with
 891  * a lower bound of data->kernel_depth.  If this depth is smaller
 892  * than the minimal depth that still ensures that every element
 893  * is accessed by a single thread, then the depth is raised
 894  * to this minimal depth.
 895  * The fields of the tile are then adjusted to only refer to the tile->depth
 896  * outer schedule dimensions.
 897  *
 898  * We also check that the index expression only depends on parallel
 899  * loops.  That way, we can move those loops innermost and unroll them.
 900  * Again, we use a test that is stricter than necessary.
 901  * We actually check whether the index expression only depends
 902  * on the iterators that are wrapped over the threads.
 903  * These are necessarily parallel, but there may be more parallel loops.
 904  *
 905  * Combining the injectivity of the first test with the single-valuedness
 906  * of the second test, we simply test for bijectivity.
 907  *
 908  * If the use of the private tile requires unrolling, but some
 909  * of the other arrays are forcibly mapped to private memory,
 910  * then we do not allow the use of this private tile since
 911  * we cannot move the schedule dimensions that need to be unrolled down
 912  * without performing some kind of expansion on those arrays
 913  * that are forcibly mapped to private memory.
 914  *
 915  * If the array is marked force_private, then we bypass all checks
 916  * and assume we can (and should) use registers only.
 917  *
 918  * If it turns out we can (or have to) use registers, we compute
 919  * the private memory tile size using can_tile, after introducing a dependence
 920  * on the thread indices.
 921  */
 922 static isl_stat compute_group_bounds_core(struct ppcg_kernel *kernel,
 923         struct gpu_array_ref_group *group, struct gpu_group_data *data)
 924 {
 925         isl_ctx *ctx = isl_space_get_ctx(group->array->space);
 926         isl_union_map *access, *local;
 927         int n_index = group->array->n_index;
 928         int no_reuse, coalesced;
 929         isl_map *acc;
 930         int force_private = group->local_array->force_private;
 931         int use_shared = !force_private && kernel->options->use_shared_memory &&
 932                                 data->n_thread > 0;
 933         int use_private = force_private || kernel->options->use_private_memory;
 934         isl_stat r = isl_stat_ok;
 935         isl_bool ok;
 936         int requires_unroll;
 937         int unique_depth;
 938
 939         if (!use_shared && !use_private)
 940                 return isl_stat_ok;
 941         if (gpu_array_is_read_only_scalar(group->array))
 942                 return isl_stat_ok;
 943         if (!force_private && !group->exact_write)
 944                 return isl_stat_ok;
 945         if (group->slice)
 946                 return isl_stat_ok;
 947
 948         access = gpu_array_ref_group_access_relation(group, 1, 1);
 949         local = localize_access(data, isl_union_map_copy(access));
 950         no_reuse = isl_union_map_is_injective(local);
 951         if (no_reuse < 0)
 952                 r = isl_stat_error;
 953         if (use_shared && no_reuse)
 954                 coalesced = access_is_coalesced(data, local);
 955         isl_union_map_free(local);
 956
 957         if (r >= 0 && kernel->options->debug->verbose &&
 958             use_shared && no_reuse && coalesced)
 959                 report_no_reuse_and_coalesced(kernel, access);
 960
 961         if (use_shared && (!no_reuse || !coalesced)) {
 962                 group->shared_tile = gpu_array_tile_create(ctx,
 963                                                         group->array->n_index);
 964                 acc = shared_access(group, access, data);
 965                 ok = can_tile(acc, group->shared_tile);
 966                 if (ok < 0)
 967                         r = isl_stat_error;
 968                 else if (!ok)
 969                         group->shared_tile =
 970                                         gpu_array_tile_free(group->shared_tile);
 971                 isl_map_free(acc);
 972         }
 973
 974         if (r < 0 || (!force_private && (!use_private || no_reuse))) {
 975                 isl_union_map_free(access);
 976                 return r;
 977         }
 978
 979         access = isl_union_map_apply_domain(access,
 980                                         isl_union_map_copy(data->thread_sched));
 981
 982         acc = isl_map_from_union_map(access);
 983
 984         if (!force_private && !access_is_bijective(data, acc)) {
 985                 isl_map_free(acc);
 986                 return isl_stat_ok;
 987         }
 988
 989         unique_depth = compute_accessed_by_single_thread_depth(data, acc);
 990
 991         acc = isl_map_intersect_domain(acc, isl_set_copy(data->privatization));
 992         acc = isl_map_project_out(acc, isl_dim_in, data->thread_depth,
 993                                                                 data->n_thread);
 994         requires_unroll = check_requires_unroll(data, acc, force_private);
 995         if (unique_depth < 0 || requires_unroll < 0 ||
 996             (requires_unroll && kernel->any_force_private)) {
 997                 isl_map_free(acc);
 998                 return requires_unroll < 0 ? isl_stat_error : isl_stat_ok;
 999         }
1000
1001         group->private_tile = gpu_array_tile_create(ctx, n_index);
1002         group->private_tile->requires_unroll = requires_unroll;
1003         ok = can_tile(acc, group->private_tile);
1004         if (ok >= 0 && !ok)
1005                 group->private_tile = gpu_array_tile_free(group->private_tile);
1006         isl_map_free(acc);
1007         if (ok < 0)
1008                 return isl_stat_error;
1009
1010         if (group->private_tile) {
1011                 struct gpu_array_tile *tile = group->private_tile;
1012                 int tile_depth = compute_tile_depth(data, tile);
1013                 if (tile_depth < unique_depth)
1014                         tile_depth = unique_depth;
1015                 if (tile_adjust_depth(tile, tile_depth) < 0)
1016                         return isl_stat_error;
1017         }
1018
1019         if (force_private && !group->private_tile)
1020                 isl_die(ctx, isl_error_internal,
1021                         "unable to map array reference group to registers",
1022                         return isl_stat_error);
1023
1024         return isl_stat_ok;
1025 }
1026
/* Compute the private and/or shared memory tiles for the array
 * reference group "group" through compute_group_bounds_core and
 * subsequently set the depth of the group through set_depth.
 * Return 0 on success and -1 on error.
 * A NULL "group" is treated as an error.
 */
static int compute_group_bounds(struct ppcg_kernel *kernel,
        struct gpu_array_ref_group *group, struct gpu_group_data *data)
{
        int failed;

        /* The steps are evaluated left to right and
         * evaluation stops at the first failure.
         */
        failed = !group ||
                 compute_group_bounds_core(kernel, group, data) < 0 ||
                 set_depth(data, group) < 0;

        return failed ? -1 : 0;
}
1043
1044 /* If two groups have overlapping access relations (as determined by
1045  * the "overlap" function) and if one of them involves a write,
1046  * then merge the two groups into one.
1047  * If "compute_bounds" is set, then call compute_group_bounds
1048  * on the merged groups.
1049  * If any group is merged into the current group, then its access
1050  * relation may have changed or it may have been turned into a write.
1051  * The combined group might therefore overlap with groups that
1052  * the original group did not overlap with.  The groups therefore
1053  * need to be checked again.
1054  *
1055  * Return the updated number of groups.
1056  * Return -1 on error.
1057  */
1058 static int group_writes(struct ppcg_kernel *kernel,
1059         int n, struct gpu_array_ref_group **groups,
1060         int (*overlap)(struct gpu_array_ref_group *group1,
1061                 struct gpu_array_ref_group *group2), int compute_bounds,
1062         struct gpu_group_data *data)
1063 {
1064         int i, j;
1065         int any_merge;
1066
1067         for (i = 0; i < n; i += !any_merge) {
1068                 any_merge = 0;
1069                 for (j = n - 1; j > i; --j) {
1070                         if (!groups[i]->write && !groups[j]->write)
1071                                 continue;
1072
1073                         if (!overlap(groups[i], groups[j]))
1074                                 continue;
1075
1076                         any_merge = 1;
1077                         groups[i] = join_groups_and_free(groups[i], groups[j]);
1078                         if (j != n - 1)
1079                                 groups[j] = groups[n - 1];
1080                         groups[n - 1] = NULL;
1081                         n--;
1082
1083                         if (!groups[i])
1084                                 return -1;
1085                         if (compute_bounds &&
1086                             compute_group_bounds(kernel, groups[i], data) < 0)
1087                                 return -1;
1088                 }
1089         }
1090
1091         return n;
1092 }
1093
1094 /* If two groups have overlapping access relations (within the innermost
1095  * loop) and if one of them involves a write, then merge the two groups
1096  * into one.
1097  *
1098  * Return the updated number of groups.
1099  */
1100 static int group_overlapping_writes(struct ppcg_kernel *kernel,
1101         int n, struct gpu_array_ref_group **groups,
1102         struct gpu_group_data *data)
1103 {
1104         return group_writes(kernel, n, groups, &accesses_overlap, 0, data);
1105 }
1106
1107 /* Check if the access relations of group1 and group2 overlap within
1108  * the outermost min(group1->min_depth, group2->min_depth) loops.
1109  */
1110 static int depth_accesses_overlap(struct gpu_array_ref_group *group1,
1111         struct gpu_array_ref_group *group2)
1112 {
1113         int depth;
1114         int dim;
1115         int empty;
1116         isl_map *map_i, *map_j, *map;
1117
1118         depth = group1->min_depth;
1119         if (group2->min_depth < depth)
1120                 depth = group2->min_depth;
1121         map_i = isl_map_copy(group1->access);
1122         dim = isl_map_dim(map_i, isl_dim_in);
1123         map_i = isl_map_eliminate(map_i, isl_dim_in, depth, dim - depth);
1124         map_j = isl_map_copy(group2->access);
1125         map_j = isl_map_eliminate(map_j, isl_dim_in, depth, dim - depth);
1126         map = isl_map_intersect(map_i, map_j);
1127         empty = isl_map_is_empty(map);
1128         isl_map_free(map);
1129
1130         return !empty;
1131 }
1132
1133 /* If two groups have overlapping access relations (within the outer
1134  * depth loops) and if one of them involves a write,
1135  * then merge the two groups into one.
1136  *
1137  * Return the updated number of groups.
1138  */
1139 static int group_depth_overlapping_writes(struct ppcg_kernel *kernel,
1140         int n, struct gpu_array_ref_group **groups, struct gpu_group_data *data)
1141 {
1142         return group_writes(kernel, n, groups, &depth_accesses_overlap, 1,
1143                                 data);
1144 }
1145
1146 /* Is the size of the tile specified by "tile" smaller than the sum of
1147  * the sizes of the tiles specified by "tile1" and "tile2"?
1148  */
1149 static int smaller_tile(struct gpu_array_tile *tile,
1150         struct gpu_array_tile *tile1, struct gpu_array_tile *tile2)
1151 {
1152         int smaller;
1153         isl_val *size, *size1, *size2;
1154
1155         size = gpu_array_tile_size(tile);
1156         size1 = gpu_array_tile_size(tile1);
1157         size2 = gpu_array_tile_size(tile2);
1158
1159         size = isl_val_sub(size, size1);
1160         size = isl_val_sub(size, size2);
1161         smaller = isl_val_is_neg(size);
1162
1163         isl_val_free(size);
1164
1165         return smaller;
1166 }
1167
1168 /* Given an initial grouping of array references and shared memory tiles
1169  * for each group that allows for a shared memory tile, merge two groups
1170  * if both have a shared memory tile, the merged group also has
1171  * a shared memory tile and the size of the tile for the merge group
1172  * is smaller than the sum of the tile sizes of the individual groups.
1173  * If any group is merged into the current group, then it may become
1174  * profitable to combine it with groups that were considered before
1175  * the merge.  The groups are therefore checked again after a merge.
1176  *
1177  * If merging two groups decreases the depth of the tile of
1178  * one or both of the two groups, then we need to check for overlapping
1179  * writes again.
1180  *
1181  * Return the number of groups after merging.
1182  * Return -1 on error.
1183  */
1184 static int group_common_shared_memory_tile(struct ppcg_kernel *kernel,
1185         struct gpu_array_info *array, int n,
1186         struct gpu_array_ref_group **groups, struct gpu_group_data *data)
1187 {
1188         int i, j;
1189         int recompute_overlap = 0;
1190         int any_merge;
1191
1192         for (i = 0; i < n; i += !any_merge) {
1193                 any_merge = 0;
1194                 if (!groups[i]->shared_tile)
1195                         continue;
1196                 for (j = n - 1; j > i; --j) {
1197                         struct gpu_array_ref_group *group;
1198
1199                         if (!groups[j]->shared_tile)
1200                                 continue;
1201
1202                         if (!depth_accesses_overlap(groups[i], groups[j]))
1203                                 continue;
1204
1205                         group = join_groups(groups[i], groups[j]);
1206                         if (compute_group_bounds(kernel, group, data) < 0) {
1207                                 gpu_array_ref_group_free(group);
1208                                 return -1;
1209                         }
1210                         if (!group->shared_tile ||
1211                             !smaller_tile(group->shared_tile,
1212                                         groups[i]->shared_tile,
1213                                         groups[j]->shared_tile)) {
1214                                 gpu_array_ref_group_free(group);
1215                                 continue;
1216                         }
1217
1218                         any_merge = 1;
1219                         if (group->min_depth < groups[i]->min_depth ||
1220                             group->min_depth < groups[j]->min_depth)
1221                                 recompute_overlap = 1;
1222                         gpu_array_ref_group_free(groups[i]);
1223                         gpu_array_ref_group_free(groups[j]);
1224                         groups[i] = group;
1225                         if (j != n - 1)
1226                                 groups[j] = groups[n - 1];
1227                         n--;
1228                 }
1229         }
1230
1231         if (recompute_overlap)
1232                 n = group_depth_overlapping_writes(kernel, n, groups, data);
1233         return n;
1234 }
1235
1236 /* Set array->n_group and array->groups to n and groups.
1237  *
1238  * Additionally, set the "nr" field of each group.
1239  */
1240 static void set_array_groups(struct gpu_local_array_info *array,
1241         int n, struct gpu_array_ref_group **groups)
1242 {
1243         int i;
1244
1245         array->n_group = n;
1246         array->groups = groups;
1247
1248         for (i = 0; i < n; ++i)
1249                 groups[i]->nr = i;
1250 }
1251
/* Fold all "n" groups in "groups" into the first one, starting from
 * the last group, and clear the positions of the groups
 * that have been absorbed.
 * Return the new number of groups, which is 1, or 0 if there were
 * no groups to start with.
 */
static int join_all_groups(int n, struct gpu_array_ref_group **groups)
{
        while (n > 1) {
                --n;
                groups[0] = join_groups_and_free(groups[0], groups[n]);
                groups[n] = NULL;
        }

        return n;
}
1267
1268 /* Group array references that should be considered together when
1269  * deciding whether to access them from private, shared or global memory.
1270  * Return -1 on error.
1271  *
1272  * In particular, if two array references overlap and if one of them
1273  * is a write, then the two references are grouped together.
1274  * We first perform an initial grouping based only on the access relation.
1275  * After computing shared and private memory tiles, we check for
1276  * overlapping writes again, but this time taking into account
1277  * the depth of the effective tile.
1278  *
1279  * Furthermore, if two groups admit a shared memory tile and if the
1280  * combination of the two also admits a shared memory tile, we merge
1281  * the two groups.
1282  *
1283  * If the array contains structures, then we compute a single
1284  * reference group without trying to find any tiles
1285  * since we do not map such arrays to private or shared
1286  * memory.  The only exception is when those arrays of structures
1287  * are required to be mapped to private memory.
1288  */
1289 static int group_array_references(struct ppcg_kernel *kernel,
1290         struct gpu_local_array_info *local, struct gpu_group_data *data)
1291 {
1292         int i;
1293         int n;
1294         isl_ctx *ctx = isl_union_map_get_ctx(data->shared_sched);
1295         struct gpu_array_ref_group **groups;
1296
1297         groups = isl_calloc_array(ctx, struct gpu_array_ref_group *,
1298                                         local->array->n_ref);
1299         if (!groups)
1300                 return -1;
1301
1302         n = populate_array_references(local, groups, data);
1303
1304         if (local->array->has_compound_element && !local->force_private) {
1305                 n = join_all_groups(n, groups);
1306                 set_array_groups(local, n, groups);
1307                 return 0;
1308         }
1309
1310         n = group_overlapping_writes(kernel, n, groups, data);
1311
1312         for (i = 0; i < n; ++i)
1313                 if (compute_group_bounds(kernel, groups[i], data) < 0)
1314                         n = -1;
1315
1316         n = group_depth_overlapping_writes(kernel, n, groups, data);
1317
1318         n = group_common_shared_memory_tile(kernel, local->array,
1319                                             n, groups, data);
1320
1321         set_array_groups(local, n, groups);
1322
1323         if (n >= 0)
1324                 return 0;
1325
1326         for (i = 0; i < local->array->n_ref; ++i)
1327                 gpu_array_ref_group_free(groups[i]);
1328         return -1;
1329 }
1330
1331 /* For each array in the input program that can be mapped to private memory,
1332  * check if there are any order dependences active inside the current kernel,
1333  * within the same iteration of the host schedule, i.e., the prefix
1334  * schedule at "node".
1335  * If so, mark the array as force_private so that its reference groups will be
1336  * mapped to a registers.
1337  *
1338  * Note that the arrays that cannot be mapped to private memory have
1339  * had their order dependences added to prog->array_order and
1340  * subsequently to the coincidence constraints.
1341  */
1342 static void check_can_be_private_live_ranges(struct ppcg_kernel *kernel,
1343         __isl_keep isl_schedule_node *node)
1344 {
1345         int i;
1346         isl_union_set *domain;
1347         isl_multi_union_pw_aff *prefix;
1348         isl_union_pw_multi_aff *contraction;
1349
1350         if (!kernel->options->live_range_reordering)
1351                 return;
1352
1353         kernel->any_force_private = 0;
1354
1355         prefix = isl_schedule_node_get_prefix_schedule_multi_union_pw_aff(node);
1356         contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
1357         prefix = isl_multi_union_pw_aff_pullback_union_pw_multi_aff(prefix,
1358                                                                 contraction);
1359         domain = isl_union_set_copy(kernel->expanded_domain);
1360         domain = isl_union_set_universe(domain);
1361
1362         for (i = 0; i < kernel->n_array; ++i) {
1363                 struct gpu_local_array_info *local = &kernel->array[i];
1364                 isl_union_map *order;
1365
1366                 local->force_private = 0;
1367                 if (!gpu_array_can_be_private(local->array))
1368                         continue;
1369                 order = isl_union_map_copy(local->array->dep_order);
1370                 order = isl_union_map_intersect_domain(order,
1371                                                     isl_union_set_copy(domain));
1372                 order = isl_union_map_intersect_range(order,
1373                                                     isl_union_set_copy(domain));
1374                 order = isl_union_map_eq_at_multi_union_pw_aff(order,
1375                                         isl_multi_union_pw_aff_copy(prefix));
1376                 if (!isl_union_map_is_empty(order)) {
1377                         local->force_private = 1;
1378                         kernel->any_force_private = 1;
1379                 }
1380                 isl_union_map_free(order);
1381         }
1382
1383         isl_multi_union_pw_aff_free(prefix);
1384         isl_union_set_free(domain);
1385 }
1386
1387 /* Expand the domain of the schedule "s" by plugging in
1388  * the contraction "contraction" and return the result.
1389  */
1390 static __isl_give isl_union_map *expand(__isl_take isl_union_map *s,
1391         __isl_keep isl_union_pw_multi_aff *contraction)
1392 {
1393         contraction = isl_union_pw_multi_aff_copy(contraction);
1394         s = isl_union_map_preimage_domain_union_pw_multi_aff(s, contraction);
1395         return s;
1396 }
1397
1398 /* Create a set of dimension data->thread_depth + data->n_thread
1399  * that equates the residue of the final data->n_thread dimensions
1400  * modulo the kernel->block_dim sizes to the thread identifiers.
1401  * Store the computed set in data->privatization.
1402  *
1403  * The construction starts with the space of kernel->thread_filter,
1404  * which is known to reference all thread identifiers.
1405  */
1406 static void compute_privatization(struct gpu_group_data *data,
1407         struct ppcg_kernel *kernel)
1408 {
1409         int i;
1410         isl_ctx *ctx;
1411         isl_space *space;
1412         isl_local_space *ls;
1413         isl_set *set;
1414
1415         ctx = isl_union_map_get_ctx(data->shared_sched);
1416         space = isl_union_set_get_space(kernel->thread_filter);
1417         space = isl_space_set_from_params(space);
1418         space = isl_space_add_dims(space, isl_dim_set,
1419                                     data->thread_depth + data->n_thread);
1420         set = isl_set_universe(space);
1421         space = isl_set_get_space(set);
1422         ls = isl_local_space_from_space(space);
1423
1424         for (i = 0; i < data->n_thread; ++i) {
1425                 isl_aff *aff, *aff2;
1426                 isl_constraint *c;
1427                 isl_val *v;
1428                 isl_id *id;
1429                 int pos;
1430
1431                 if (!set)
1432                         break;
1433
1434                 aff = isl_aff_var_on_domain(isl_local_space_copy(ls),
1435                                         isl_dim_set, data->thread_depth + i);
1436                 v = isl_val_int_from_si(ctx, kernel->block_dim[i]);
1437                 aff = isl_aff_mod_val(aff, v);
1438                 id = isl_id_list_get_id(kernel->thread_ids, i);
1439                 pos = isl_set_find_dim_by_id(set, isl_dim_param, id);
1440                 isl_id_free(id);
1441                 aff2 = isl_aff_var_on_domain(isl_local_space_copy(ls),
1442                                         isl_dim_param, pos);
1443                 aff = isl_aff_sub(aff, aff2);
1444                 c = isl_equality_from_aff(aff);
1445                 set = isl_set_add_constraint(set, c);
1446         }
1447
1448         isl_local_space_free(ls);
1449         data->privatization = set;
1450 }
1451
1452 /* Return the prefix schedule at "node" as a relation
1453  * between domain elements and schedule dimensions after detecting
1454  * equalities in this relation.
1455  */
1456 static __isl_give isl_union_map *prefix_with_equalities(
1457         __isl_keep isl_schedule_node *node)
1458 {
1459         isl_union_map *schedule;
1460
1461         schedule = isl_schedule_node_get_prefix_schedule_relation(node);
1462         schedule = isl_union_map_detect_equalities(schedule);
1463
1464         return schedule;
1465 }
1466
1467 /* Group references of all arrays in "kernel".
1468  * "node" points to the kernel mark.
1469  * The mapping to shared memory in computed at the "shared" mark.
1470  *
1471  * We first extract all required schedule information into
1472  * a gpu_group_data structure and then consider each array
1473  * in turn.
1474  */
1475 int gpu_group_references(struct ppcg_kernel *kernel,
1476         __isl_keep isl_schedule_node *node)
1477 {
1478         int i;
1479         int r = 0;
1480         isl_union_pw_multi_aff *contraction;
1481         struct gpu_group_data data;
1482
1483         check_can_be_private_live_ranges(kernel, node);
1484
1485         data.scop = kernel->prog->scop;
1486
1487         data.kernel_depth = isl_schedule_node_get_schedule_depth(node);
1488         data.host_sched = isl_schedule_node_get_prefix_schedule_relation(node);
1489
1490         node = isl_schedule_node_copy(node);
1491         node = gpu_tree_move_down_to_shared(node, kernel->core);
1492         data.shared_depth = isl_schedule_node_get_schedule_depth(node);
1493         data.shared_sched = prefix_with_equalities(node);
1494
1495         node = gpu_tree_move_down_to_thread(node, kernel->core);
1496         node = isl_schedule_node_child(node, 0);
1497         data.thread_depth = isl_schedule_node_get_schedule_depth(node);
1498         data.n_thread = isl_schedule_node_band_n_member(node);
1499         if (data.thread_depth == data.shared_depth)
1500                 data.copy_sched = isl_union_map_copy(data.shared_sched);
1501         else
1502                 data.copy_sched = prefix_with_equalities(node);
1503         data.thread_sched = isl_union_map_copy(data.copy_sched);
1504         data.thread_sched = isl_union_map_flat_range_product(data.thread_sched,
1505                 isl_schedule_node_band_get_partial_schedule_union_map(node));
1506         data.thread_sched = isl_union_map_detect_equalities(data.thread_sched);
1507
1508         contraction = isl_union_pw_multi_aff_copy(kernel->contraction);
1509         data.host_sched = expand(data.host_sched, contraction);
1510         data.shared_sched = expand(data.shared_sched, contraction);
1511         if (data.thread_depth == data.shared_depth) {
1512                 isl_union_map_free(data.copy_sched);
1513                 data.copy_sched = isl_union_map_copy(data.shared_sched);
1514         } else {
1515                 data.copy_sched = expand(data.copy_sched, contraction);
1516         }
1517         data.thread_sched = expand(data.thread_sched, contraction);
1518         isl_union_pw_multi_aff_free(contraction);
1519
1520         node = isl_schedule_node_child(node, 0);
1521         data.full_sched = isl_union_map_copy(data.thread_sched);
1522         data.full_sched = isl_union_map_flat_range_product(data.full_sched,
1523                 isl_schedule_node_get_subtree_schedule_union_map(node));
1524         isl_schedule_node_free(node);
1525
1526         compute_privatization(&data, kernel);
1527
1528         for (i = 0; i < kernel->n_array; ++i) {
1529                 r = group_array_references(kernel, &kernel->array[i], &data);
1530                 if (r < 0)
1531                         break;
1532         }
1533
1534         isl_union_map_free(data.host_sched);
1535         isl_union_map_free(data.shared_sched);
1536         isl_union_map_free(data.copy_sched);
1537         isl_union_map_free(data.thread_sched);
1538         isl_union_map_free(data.full_sched);
1539         isl_set_free(data.privatization);
1540
1541         return r;
1542 }
1543
1544 /* Given a description of an array tile "tile" and the "space"
1545  *
1546  *      { D -> A }
1547  *
1548  * where D represents the first tile->depth schedule dimensions
1549  * and A represents the array, construct an isl_multi_aff
1550  *
1551  *      { [D[i] -> A[a]] -> A'[a'] }
1552  *
1553  * with A' a scaled down copy of A according to the shifts and strides
1554  * in "tile".  In particular,
1555  *
1556  *      a' = (a + shift(i))/stride
1557  *
1558  * "insert_array" represents
1559  *
1560  *      { [D -> A] -> D }
1561  *
1562  * and is used to insert A into the domain of functions that only
1563  * reference D.
1564  */
1565 static __isl_give isl_multi_aff *strided_tile(
1566         struct gpu_array_tile *tile, __isl_keep isl_space *space,
1567         __isl_keep isl_multi_aff *insert_array)
1568 {
1569         int i;
1570         isl_ctx *ctx;
1571         isl_multi_aff *shift;
1572         isl_multi_val *stride;
1573         isl_space *space2;
1574         isl_local_space *ls;
1575         isl_multi_aff *tiling;
1576
1577         ctx = isl_space_get_ctx(space);
1578         space2 = isl_space_domain(isl_space_copy(space));
1579         ls = isl_local_space_from_space(space2);
1580         space2 = isl_space_range(isl_space_copy(space));
1581         stride = isl_multi_val_zero(space2);
1582         shift = isl_multi_aff_zero(isl_space_copy(space));
1583
1584         for (i = 0; i < tile->n; ++i) {
1585                 struct gpu_array_bound *bound = &tile->bound[i];
1586                 isl_val *stride_i;
1587                 isl_aff *shift_i;
1588
1589                 stride_i = isl_val_copy(bound->stride);
1590                 shift_i = isl_aff_copy(bound->shift);
1591
1592                 stride = isl_multi_val_set_val(stride, i, stride_i);
1593                 shift = isl_multi_aff_set_aff(shift, i, shift_i);
1594         }
1595         isl_local_space_free(ls);
1596
1597         shift = isl_multi_aff_pullback_multi_aff(shift,
1598                                     isl_multi_aff_copy(insert_array));
1599
1600         tiling = isl_multi_aff_range_map(isl_space_copy(space));
1601         tiling = isl_multi_aff_add(tiling, shift);
1602         tiling = isl_multi_aff_scale_down_multi_val(tiling, stride);
1603
1604         return tiling;
1605 }
1606
1607 /* Compute a tiling for the array reference group "group".
1608  *
1609  * The tiling is of the form
1610  *
1611  *      { [D[i] -> A[a]] -> T[t] }
1612  *
1613  * where D represents the first tile->depth schedule dimensions,
1614  * A represents the global array and T represents the shared or
1615  * private memory tile.  The name of T is the name of the local
1616  * array.
1617  *
1618  * If there is any stride in the accesses, then the mapping is
1619  *
1620  *      t = (a + shift(i))/stride - lb(i)
1621  *
1622  * otherwise, it is simply
1623  *
1624  *      t = a - lb(i)
1625  */
1626 void gpu_array_ref_group_compute_tiling(struct gpu_array_ref_group *group)
1627 {
1628         int i;
1629         struct gpu_array_tile *tile;
1630         isl_space *space;
1631         isl_multi_aff *tiling, *lb, *insert_array;
1632         isl_printer *p;
1633         char *local_name;
1634
1635         tile = gpu_array_ref_group_tile(group);
1636         if (!tile)
1637                 return;
1638
1639         space = isl_map_get_space(group->access);
1640         space = isl_space_from_range(isl_space_range(space));
1641         space = isl_space_add_dims(space, isl_dim_in, tile->depth);
1642         insert_array = isl_multi_aff_domain_map(isl_space_copy(space));
1643
1644         for (i = 0; i < tile->n; ++i)
1645                 if (tile->bound[i].shift)
1646                         break;
1647
1648         if (i < tile->n)
1649                 tiling = strided_tile(tile, space, insert_array);
1650         else
1651                 tiling = isl_multi_aff_range_map(isl_space_copy(space));
1652
1653         lb = isl_multi_aff_zero(space);
1654         for (i = 0; i < tile->n; ++i) {
1655                 isl_aff *lb_i = isl_aff_copy(tile->bound[i].lb);
1656                 lb = isl_multi_aff_set_aff(lb, i, lb_i);
1657         }
1658         lb = isl_multi_aff_pullback_multi_aff(lb, insert_array);
1659
1660         tiling = isl_multi_aff_sub(tiling, lb);
1661
1662         p = isl_printer_to_str(isl_multi_aff_get_ctx(tiling));
1663         p = gpu_array_ref_group_print_name(group, p);
1664         local_name = isl_printer_get_str(p);
1665         isl_printer_free(p);
1666         tiling = isl_multi_aff_set_tuple_name(tiling, isl_dim_out, local_name);
1667         free(local_name);
1668
1669         tile->tiling = tiling;
1670 }