mlir/test/Dialect/Affine/loop-fusion-4.mlir

   1 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer}))' -split-input-file | FileCheck %s --check-prefix=PRODUCER-CONSUMER
   2 // RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(affine-loop-fusion{fusion-maximal mode=sibling}))' -split-input-file | FileCheck %s --check-prefix=SIBLING-MAXIMAL
   3
   4 // Part I of fusion tests is in mlir/test/Dialect/Affine/loop-fusion.mlir.
   5 // Part II of fusion tests is in mlir/test/Dialect/Affine/loop-fusion-2.mlir.
   6 // Part III of fusion tests is in mlir/test/Dialect/Affine/loop-fusion-3.mlir.
   7
   8 // Expects fusion of the producer nest into the consumer nest at depth 4 and
   9 // subsequent removal of the source (producer) loop nest.
  10 // PRODUCER-CONSUMER-LABEL: func @unflatten4d
  11 func.func @unflatten4d(%arg1: memref<7x8x9x10xf32>) {
  12   %m = memref.alloc() : memref<5040xf32>
  13   %cf7 = arith.constant 7.0 : f32
  14
       // Producer nest: stores %cf7 into the flat buffer %m at the linearized
       // index 720 * %i0 + 90 * %i1 + 10 * %i2 + %i3.
  15   affine.for %i0 = 0 to 7 {
  16     affine.for %i1 = 0 to 8 {
  17       affine.for %i2 = 0 to 9 {
  18         affine.for %i3 = 0 to 10 {
  19           affine.store %cf7, %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
  20         }
  21       }
  22     }
  23   }
       // Consumer nest: loads %m with the same linearized index and stores the
       // loaded value to the 4-d memref %arg1 at [%i0, %i1, %i2, %i3].
  24   affine.for %i0 = 0 to 7 {
  25     affine.for %i1 = 0 to 8 {
  26       affine.for %i2 = 0 to 9 {
  27         affine.for %i3 = 0 to 10 {
  28           %v0 = affine.load %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32>
  29           affine.store %v0, %arg1[%i0, %i1, %i2, %i3] : memref<7x8x9x10xf32>
  30         }
  31       }
  32     }
  33   }
  34   return
  35 }
  36
     // The output must contain four directly nested affine.for ops and no further
     // affine.for before the return, i.e. no separate producer nest remains.
  37 // PRODUCER-CONSUMER:        affine.for
  38 // PRODUCER-CONSUMER-NEXT:     affine.for
  39 // PRODUCER-CONSUMER-NEXT:       affine.for
  40 // PRODUCER-CONSUMER-NEXT:         affine.for
  41 // PRODUCER-CONSUMER-NOT:    affine.for
  42 // PRODUCER-CONSUMER: return
  43
  44 // -----
  45
  46 // Expects fusion of the producer nest into the consumer nest at depth 2 and
  47 // subsequent removal of the source (producer) loop nest.
  48 // PRODUCER-CONSUMER-LABEL: func @unflatten2d_with_transpose
  49 func.func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) {
  50   %m = memref.alloc() : memref<56xf32>
  51   %cf7 = arith.constant 7.0 : f32
  52
       // Producer nest: iterates 7 x 8 and stores %cf7 to %m[8 * %i0 + %i1].
  53   affine.for %i0 = 0 to 7 {
  54     affine.for %i1 = 0 to 8 {
  55       affine.store %cf7, %m[8 * %i0 + %i1] : memref<56xf32>
  56     }
  57   }
       // Consumer nest: iterates 8 x 7 (loop bounds swapped relative to the
       // producer) and reads %m[%i0 + 8 * %i1], so the roles of the two induction
       // variables in the linearized index are transposed.
  58   affine.for %i0 = 0 to 8 {
  59     affine.for %i1 = 0 to 7 {
  60       %v0 = affine.load %m[%i0 + 8 * %i1] : memref<56xf32>
  61       affine.store %v0, %arg1[%i0, %i1] : memref<8x7xf32>
  62     }
  63   }
  64   return
  65 }
  66
     // The output must contain two directly nested affine.for ops and no further
     // affine.for before the return.
  67 // PRODUCER-CONSUMER:        affine.for
  68 // PRODUCER-CONSUMER-NEXT:     affine.for
  69 // PRODUCER-CONSUMER-NOT:    affine.for
  70 // PRODUCER-CONSUMER: return
  71
  72 // -----
  73
  74 // Expects fusion of producer into consumer at depth 1, with the source loop
  75 // not being removed because the source and destination loop steps differ.
  76 // PRODUCER-CONSUMER-LABEL: func @check_src_dst_step
  77 func.func @check_src_dst_step(%m : memref<100xf32>,
  78                          %src: memref<100xf32>,
  79                          %out: memref<100xf32>) {
       // Source loop: default (unit) step, copies %src[%i0] into %m[%i0].
  80   affine.for %i0 = 0 to 100 {
  81     %r1 = affine.load %src[%i0]: memref<100xf32>
  82     affine.store %r1, %m[%i0] : memref<100xf32>
  83   }
       // Destination loop: step 2, copies %m[%i2] into %out[%i2].
  84   affine.for %i2 = 0 to 100 step 2 {
  85     %r2 = affine.load %m[%i2] : memref<100xf32>
  86     affine.store %r2, %out[%i2] : memref<100xf32>
  87   }
  88   return
  89 }
  90
  91 // Check that the fusion took place and that the source loop was not
  92 // removed. To check that fusion took place, the load from the original
  93 // source loop is checked to also be present in the fused (step 2) loop.
  94 //
  95 // PRODUCER-CONSUMER:        affine.for %[[idx_0:.*]] = 0 to 100 {
  96 // PRODUCER-CONSUMER-NEXT:     %[[result_0:.*]] = affine.load %[[arr1:.*]][%[[idx_0]]] : memref<100xf32>
  97 // PRODUCER-CONSUMER-NEXT:     affine.store %[[result_0]], %{{.*}}[%[[idx_0]]] : memref<100xf32>
  98 // PRODUCER-CONSUMER-NEXT:   }
  99 // PRODUCER-CONSUMER:        affine.for %[[idx_1:.*]] = 0 to 100 step 2 {
 100 // PRODUCER-CONSUMER:          affine.load %[[arr1]][%[[idx_1]]] : memref<100xf32>
 101 // PRODUCER-CONSUMER:        }
 102 // PRODUCER-CONSUMER:        return
 103
 104 // -----
 105
 106 // SIBLING-MAXIMAL-LABEL:   func @reduce_add_non_maximal_f32_f32(
     // Two sibling nests that both read %arg0: the first reduces with addf over 64
     // iterations and writes %arg1, the second reduces with mulf over only 32
     // iterations and writes %arg2.
 107 func.func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) {
 108     %cst_0 = arith.constant 0.000000e+00 : f32
 109     %cst_1 = arith.constant 1.000000e+00 : f32
 110     affine.for %arg3 = 0 to 1 {
 111       affine.for %arg4 = 0 to 64 {
 112         %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 {
 113           %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
 114           %5 = arith.addf %prevAccum, %4 : f32
 115           affine.yield %5 : f32
 116         }
 117         %accum_dbl = arith.addf %accum, %accum : f32
 118         affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1>
 119       }
 120     }
 121     affine.for %arg3 = 0 to 1 {
 122       affine.for %arg4 = 0 to 64 {
 123         // The trip count of the following loop (32) does not match the trip
         // count of the corresponding source loop (64).
 124         %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 {
 125           %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1>
 126           %5 = arith.mulf %prevAccum, %4 : f32
 127           affine.yield %5 : f32
 128         }
 129         %accum_sqr = arith.mulf %accum, %accum : f32
 130         affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1>
 131       }
 132     }
 133     return
 134 }
 135 // Checks the loop structure after sibling fusion. Because the innermost
 136 // destination and source loop trip counts do not match, the checks expect the
 137 // 64-iteration source loop to appear, unchanged, inside the 32-iteration loop.
 138 // SIBLING-MAXIMAL:        %[[cst_0:.*]] = arith.constant 0.000000e+00 : f32
 139 // SIBLING-MAXIMAL-NEXT:        %[[cst_1:.*]] = arith.constant 1.000000e+00 : f32
 140 // SIBLING-MAXIMAL-NEXT:           affine.for %[[idx_0:.*]] = 0 to 1 {
 141 // SIBLING-MAXIMAL-NEXT:             affine.for %[[idx_1:.*]] = 0 to 64 {
 142 // SIBLING-MAXIMAL-NEXT:               %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) {
 143 // SIBLING-MAXIMAL-NEXT:                 %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) {
 144
 145 // -----
 146
 147 // SIBLING-MAXIMAL-LABEL: func @sibling_load_only
     // Two sibling loops with identical bounds that only load from %arg0 (there
     // are no stores). The checks expect a single loop containing both loads.
 148 func.func @sibling_load_only(%arg0: memref<10xf32>) {
 149   affine.for %arg1 = 0 to 10 {
 150     %0 = affine.load %arg0[%arg1] : memref<10xf32>
 151   }
 152   affine.for %arg1 = 0 to 10 {
 153     %0 = affine.load %arg0[%arg1] : memref<10xf32>
 154   }
 155   // SIBLING-MAXIMAL-NEXT: affine.for
 156   // SIBLING-MAXIMAL-NEXT:   affine.load
 157   // SIBLING-MAXIMAL-NEXT:   affine.load
 158   return
 159 }
 160
 161 // -----
 162
 163 // PRODUCER-CONSUMER-LABEL: func @fusion_for_multiple_blocks() {
     // The function body has two blocks (^bb0 and ^bb1), each holding a loop that
     // stores to %m followed by a loop that loads from %m. The checks expect each
     // pair to become one loop whose store and load both use index 0 of a
     // memref<1xf32>.
 164 func.func @fusion_for_multiple_blocks() {
 165 ^bb0:
 166   %m = memref.alloc() : memref<10xf32>
 167   %cf7 = arith.constant 7.0 : f32
 168
       // First producer/consumer pair, in the entry block.
 169   affine.for %i0 = 0 to 10 {
 170     affine.store %cf7, %m[%i0] : memref<10xf32>
 171   }
 172   affine.for %i1 = 0 to 10 {
 173     %v0 = affine.load %m[%i1] : memref<10xf32>
 174   }
 175   // PRODUCER-CONSUMER:      affine.for %{{.*}} = 0 to 10 {
 176   // PRODUCER-CONSUMER-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
 177   // PRODUCER-CONSUMER-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
 178   // PRODUCER-CONSUMER-NEXT: }
 179   cf.br ^bb1
 180 ^bb1:
       // Second producer/consumer pair, in the successor block; it reuses %m and
       // %cf7 defined in ^bb0.
 181   affine.for %i0 = 0 to 10 {
 182     affine.store %cf7, %m[%i0] : memref<10xf32>
 183   }
 184   affine.for %i1 = 0 to 10 {
 185     %v0 = affine.load %m[%i1] : memref<10xf32>
 186   }
 187   // PRODUCER-CONSUMER:      affine.for %{{.*}} = 0 to 10 {
 188   // PRODUCER-CONSUMER-NEXT:   affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32>
 189   // PRODUCER-CONSUMER-NEXT:   affine.load %{{.*}}[0] : memref<1xf32>
 190   // PRODUCER-CONSUMER-NEXT: }
 191   return
 192 }
 193
 194 // -----
 195
 196 // PRODUCER-CONSUMER-LABEL: @fuse_higher_dim_nest_into_lower_dim_nest
     // The producer is a 4-deep nest writing %B; the consumer is a 3-deep nest that
     // reads %B, deriving the last two indices of %B from %arg207 via floordiv 64
     // and mod 64.
 197 func.func @fuse_higher_dim_nest_into_lower_dim_nest() {
 198   %A = memref.alloc() : memref<8x12x128x64xf32>
 199   %B = memref.alloc() : memref<8x128x12x64xf32>
       // Producer nest: copies %A into %B with the second and third dimensions
       // swapped.
 200   affine.for %arg205 = 0 to 8 {
 201     affine.for %arg206 = 0 to 128 {
 202       affine.for %arg207 = 0 to 12 {
 203         affine.for %arg208 = 0 to 64 {
 204           %a = affine.load %A[%arg205, %arg207, %arg206, %arg208] : memref<8x12x128x64xf32>
 205           affine.store %a, %B[%arg205, %arg206, %arg207, %arg208] : memref<8x128x12x64xf32>
 206         }
 207       }
 208     }
 209   }
 210   %C = memref.alloc() : memref<8x128x768xf16>
       // Consumer nest: reads %B through a collapsed innermost dimension of extent
       // 768, truncates each value to f16 and stores it to %C.
 211   affine.for %arg205 = 0 to 8 {
 212     affine.for %arg206 = 0 to 128 {
 213       affine.for %arg207 = 0 to 768 {
 214         %b = affine.load %B[%arg205, %arg206, %arg207 floordiv 64, %arg207 mod 64] : memref<8x128x12x64xf32>
 215         %c = arith.truncf %b : f32 to f16
 216         affine.store %c, %C[%arg205, %arg206, %arg207] : memref<8x128x768xf16>
 217       }
 218     }
 219   }
 220
 221   // Check that fusion happens into the innermost loop of the consumer.
       // The output must contain three directly nested affine.for ops and no
       // further affine.for before the return.
 222   // PRODUCER-CONSUMER:      affine.for
 223   // PRODUCER-CONSUMER-NEXT:   affine.for %{{.*}} = 0 to 128
 224   // PRODUCER-CONSUMER-NEXT:     affine.for %{{.*}} = 0 to 768
 225   // PRODUCER-CONSUMER-NOT:  affine.for
 226   // PRODUCER-CONSUMER:      return
 227   return
 228 }