diff --git a/mlir/test/Transforms/loop-fusion-2.mlir b/mlir/test/Transforms/loop-fusion-2.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Transforms/loop-fusion-2.mlir @@ -0,0 +1,389 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion -split-input-file | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s -affine-loop-fusion="fusion-maximal" -split-input-file | FileCheck %s --check-prefix=MAXIMAL + +// Part 1 of the fusion tests is in mlir/test/Transforms/loop-fusion.mlir. +// ---- + +// MAXIMAL-LABEL: func @reduce_add_f32_f32( +func @reduce_add_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) { + %cst_0 = constant 0.000000e+00 : f32 + %cst_1 = constant 1.000000e+00 : f32 + %0 = memref.alloca() : memref<f32, 1> + %1 = memref.alloca() : memref<f32, 1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> + %5 = addf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_dbl = addf %accum, %accum : f32 + affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_1) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> + %5 = mulf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_sqr = mulf %accum, %accum : f32 + affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + return +} +// The two loops here get maximally sibling-fused at the innermost +// insertion point. Test checks if the innermost reduction loop of the fused loop +// gets promoted into its outerloop. 
+// MAXIMAL-SAME: %[[arg_0:.*]]: memref<64x64xf32, 1>, +// MAXIMAL-SAME: %[[arg_1:.*]]: memref<1x64xf32, 1>, +// MAXIMAL-SAME: %[[arg_2:.*]]: memref<1x64xf32, 1>) { +// MAXIMAL: %[[cst:.*]] = constant 0 : index +// MAXIMAL-NEXT: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32 +// MAXIMAL: affine.for %[[idx_0:.*]] = 0 to 1 { +// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { +// MAXIMAL-NEXT: %[[results:.*]]:2 = affine.for %[[idx_2:.*]] = 0 to 64 iter_args(%[[iter_0:.*]] = %[[cst_1]], %[[iter_1:.*]] = %[[cst_0]]) -> (f32, f32) { +// MAXIMAL-NEXT: %[[val_0:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1> +// MAXIMAL-NEXT: %[[reduc_0:.*]] = addf %[[iter_1]], %[[val_0]] : f32 +// MAXIMAL-NEXT: %[[val_1:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1> +// MAXIMAL-NEXT: %[[reduc_1:.*]] = mulf %[[iter_0]], %[[val_1]] : f32 +// MAXIMAL-NEXT: affine.yield %[[reduc_1]], %[[reduc_0]] : f32, f32 +// MAXIMAL-NEXT: } +// MAXIMAL-NEXT: %[[reduc_0_dbl:.*]] = addf %[[results]]#1, %[[results]]#1 : f32 +// MAXIMAL-NEXT: affine.store %[[reduc_0_dbl]], %[[arg_1]][%[[cst]], %[[idx_1]]] : memref<1x64xf32, 1> +// MAXIMAL-NEXT: %[[reduc_1_sqr:.*]] = mulf %[[results]]#0, %[[results]]#0 : f32 +// MAXIMAL-NEXT: affine.store %[[reduc_1_sqr]], %[[arg_2]][%[[idx_0]], %[[idx_1]]] : memref<1x64xf32, 1> +// MAXIMAL-NEXT: } +// MAXIMAL-NEXT: } +// MAXIMAL-NEXT: return +// MAXIMAL-NEXT: } + +// ----- + +// CHECK-LABEL: func @reduce_add_non_innermost +func @reduce_add_non_innermost(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) { + %cst = constant 0.000000e+00 : f32 + %cst_0 = constant 1.000000e+00 : f32 + %0 = memref.alloca() : memref<f32, 1> + %1 = memref.alloca() : memref<f32, 1> + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : 
memref<64x64xf32, 1> + %5 = addf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_dbl = addf %accum, %accum : f32 + affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> + %5 = mulf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_sqr = mulf %accum, %accum : f32 + affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + return +} +// Test checks the loop structure is preserved after sibling fusion. +// CHECK: affine.for +// CHECK-NEXT: affine.for +// CHECK-NEXT: affine.for +// CHECK: affine.for + +// ----- +func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) { + %cst_0 = constant 0.000000e+00 : f32 + %cst_1 = constant 1.000000e+00 : f32 + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> + %5 = addf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_dbl = addf %accum, %accum : f32 + affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + affine.for %arg3 = 0 to 1 { + affine.for %arg4 = 0 to 64 { + // Following loop trip count does not match the corresponding source trip count. + %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 { + %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> + %5 = mulf %prevAccum, %4 : f32 + affine.yield %5 : f32 + } + %accum_sqr = mulf %accum, %accum : f32 + affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> + } + } + return +} +// Test checks the loop structure is preserved after sibling fusion +// since the destination loop and source loop trip counts do not +// match. 
+// MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32( +// MAXIMAL: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32 +// MAXIMAL-NEXT: affine.for %[[idx_0:.*]] = 0 to 1 { +// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { +// MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) { +// MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) { + +// ----- + +// CHECK-LABEL: func @fuse_large_number_of_loops +func @fuse_large_number_of_loops(%arg0: memref<20x10xf32, 1>, %arg1: memref<20x10xf32, 1>, %arg2: memref<20x10xf32, 1>, %arg3: memref<20x10xf32, 1>, %arg4: memref<20x10xf32, 1>, %arg5: memref<f32, 1>, %arg6: memref<f32, 1>, %arg7: memref<f32, 1>, %arg8: memref<f32, 1>, %arg9: memref<20x10xf32, 1>, %arg10: memref<20x10xf32, 1>, %arg11: memref<20x10xf32, 1>, %arg12: memref<20x10xf32, 1>) { + %cst = constant 1.000000e+00 : f32 + %0 = memref.alloc() : memref<f32, 1> + affine.store %cst, %0[] : memref<f32, 1> + %1 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg6[] : memref<f32, 1> + affine.store %21, %1[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %2 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %arg3[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %2[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %3 = memref.alloc() : memref<f32, 1> + %4 = affine.load %arg6[] : memref<f32, 1> + %5 = affine.load %0[] : memref<f32, 1> + %6 = subf %5, %4 : f32 + affine.store %6, %3[] : memref<f32, 1> + %7 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %3[] : memref<f32, 1> + affine.store %21, %7[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %8 = memref.alloc() : 
memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %7[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %8[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %9 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %9[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %9[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %2[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = addf %22, %21 : f32 + affine.store %23, %arg11[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %10 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %arg2[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %10[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %10[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = addf %22, %21 : f32 + affine.store %23, %arg10[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %11 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %11[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %12 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 
= 0 to 10 { + %21 = affine.load %11[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %arg11[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = subf %22, %21 : f32 + affine.store %23, %12[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %13 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg7[] : memref<f32, 1> + affine.store %21, %13[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %14 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg4[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %13[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %14[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %15 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg8[] : memref<f32, 1> + affine.store %21, %15[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %16 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %15[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %12[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = addf %22, %21 : f32 + affine.store %23, %16[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %17 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %16[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = math.sqrt %21 : f32 + affine.store %22, %17[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %18 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg5[] : memref<f32, 1> + affine.store %21, %18[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %19 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg1[%arg13, %arg14] : 
memref<20x10xf32, 1> + %22 = affine.load %18[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = mulf %22, %21 : f32 + affine.store %23, %19[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + %20 = memref.alloc() : memref<20x10xf32, 1> + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %17[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %19[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = divf %22, %21 : f32 + affine.store %23, %20[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %20[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %14[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = addf %22, %21 : f32 + affine.store %23, %arg12[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + affine.for %arg13 = 0 to 20 { + affine.for %arg14 = 0 to 10 { + %21 = affine.load %arg12[%arg13, %arg14] : memref<20x10xf32, 1> + %22 = affine.load %arg0[%arg13, %arg14] : memref<20x10xf32, 1> + %23 = subf %22, %21 : f32 + affine.store %23, %arg9[%arg13, %arg14] : memref<20x10xf32, 1> + } + } + return +} +// CHECK: affine.for +// CHECK: affine.for +// CHECK-NOT: affine.for + +// ----- + +// Expects fusion of producer into consumer at depth 4 and subsequent removal of +// source loop. 
+// CHECK-LABEL: func @unflatten4d +func @unflatten4d(%arg1: memref<7x8x9x10xf32>) { + %m = memref.alloc() : memref<5040xf32> + %cf7 = constant 7.0 : f32 + + affine.for %i0 = 0 to 7 { + affine.for %i1 = 0 to 8 { + affine.for %i2 = 0 to 9 { + affine.for %i3 = 0 to 10 { + affine.store %cf7, %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32> + } + } + } + } + affine.for %i0 = 0 to 7 { + affine.for %i1 = 0 to 8 { + affine.for %i2 = 0 to 9 { + affine.for %i3 = 0 to 10 { + %v0 = affine.load %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32> + affine.store %v0, %arg1[%i0, %i1, %i2, %i3] : memref<7x8x9x10xf32> + } + } + } + } + return +} + +// CHECK: affine.for +// CHECK-NEXT: affine.for +// CHECK-NEXT: affine.for +// CHECK-NEXT: affine.for +// CHECK-NOT: affine.for +// CHECK: return + +// ----- + +// Expects fusion of producer into consumer at depth 2 and subsequent removal of +// source loop. +// CHECK-LABEL: func @unflatten2d_with_transpose +func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) { + %m = memref.alloc() : memref<56xf32> + %cf7 = constant 7.0 : f32 + + affine.for %i0 = 0 to 7 { + affine.for %i1 = 0 to 8 { + affine.store %cf7, %m[8 * %i0 + %i1] : memref<56xf32> + } + } + affine.for %i0 = 0 to 8 { + affine.for %i1 = 0 to 7 { + %v0 = affine.load %m[%i0 + 8 * %i1] : memref<56xf32> + affine.store %v0, %arg1[%i0, %i1] : memref<8x7xf32> + } + } + return +} + +// CHECK: affine.for +// CHECK-NEXT: affine.for +// CHECK-NOT: affine.for +// CHECK: return diff --git a/mlir/test/Transforms/loop-fusion.mlir b/mlir/test/Transforms/loop-fusion.mlir --- a/mlir/test/Transforms/loop-fusion.mlir +++ b/mlir/test/Transforms/loop-fusion.mlir @@ -3149,387 +3149,5 @@ // CHECK-NEXT: affine.store // ----- +// Add further tests in mlir/test/Transforms/loop-fusion-2.mlir -// MAXIMAL-LABEL: func @reduce_add_f32_f32( -func @reduce_add_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) { - %cst_0 = constant 0.000000e+00 : 
f32 - %cst_1 = constant 1.000000e+00 : f32 - %0 = memref.alloca() : memref - %1 = memref.alloca() : memref - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> - %5 = addf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_dbl = addf %accum, %accum : f32 - affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_1) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> - %5 = mulf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_sqr = mulf %accum, %accum : f32 - affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - return -} -// The two loops here get maximally sibling-fused at the innermost -// insertion point. Test checks if the innermost reduction loop of the fused loop -// gets promoted into its outerloop. 
-// MAXIMAL-SAME: %[[arg_0:.*]]: memref<64x64xf32, 1>, -// MAXIMAL-SAME: %[[arg_1:.*]]: memref<1x64xf32, 1>, -// MAXIMAL-SAME: %[[arg_2:.*]]: memref<1x64xf32, 1>) { -// MAXIMAL: %[[cst:.*]] = constant 0 : index -// MAXIMAL-NEXT: %[[cst_0:.*]] = constant 0.000000e+00 : f32 -// MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32 -// MAXIMAL: affine.for %[[idx_0:.*]] = 0 to 1 { -// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { -// MAXIMAL-NEXT: %[[results:.*]]:2 = affine.for %[[idx_2:.*]] = 0 to 64 iter_args(%[[iter_0:.*]] = %[[cst_1]], %[[iter_1:.*]] = %[[cst_0]]) -> (f32, f32) { -// MAXIMAL-NEXT: %[[val_0:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1> -// MAXIMAL-NEXT: %[[reduc_0:.*]] = addf %[[iter_1]], %[[val_0]] : f32 -// MAXIMAL-NEXT: %[[val_1:.*]] = affine.load %[[arg_0]][%[[idx_2]], %[[idx_1]]] : memref<64x64xf32, 1> -// MAXIMAL-NEXT: %[[reduc_1:.*]] = mulf %[[iter_0]], %[[val_1]] : f32 -// MAXIMAL-NEXT: affine.yield %[[reduc_1]], %[[reduc_0]] : f32, f32 -// MAXIMAL-NEXT: } -// MAXIMAL-NEXT: %[[reduc_0_dbl:.*]] = addf %[[results:.*]]#1, %[[results]]#1 : f32 -// MAXIMAL-NEXT: affine.store %[[reduc_0_dbl]], %[[arg_1]][%[[cst]], %[[idx_1]]] : memref<1x64xf32, 1> -// MAXIMAL-NEXT: %[[reduc_1_sqr:.*]] = mulf %[[results]]#0, %[[results]]#0 : f32 -// MAXIMAL-NEXT: affine.store %[[reduc_1_sqr]], %[[arg_2]][%[[idx_0]], %[[idx_1]]] : memref<1x64xf32, 1> -// MAXIMAL-NEXT: } -// MAXIMAL-NEXT: } -// MAXIMAL-NEXT: return -// MAXIMAL-NEXT: } - -// ----- - -// CHECK-LABEL: func @reduce_add_non_innermost -func @reduce_add_non_innermost(%arg0: memref<64x64xf32, 1>, %arg1: memref<1x64xf32, 1>, %arg2: memref<1x64xf32, 1>) { - %cst = constant 0.000000e+00 : f32 - %cst_0 = constant 1.000000e+00 : f32 - %0 = memref.alloca() : memref - %1 = memref.alloca() : memref - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : 
memref<64x64xf32, 1> - %5 = addf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_dbl = addf %accum, %accum : f32 - affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> - %5 = mulf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_sqr = mulf %accum, %accum : f32 - affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - return -} -// Test checks the loop structure is preserved after sibling fusion. -// CHECK: affine.for -// CHECK-NEXT: affine.for -// CHECK-NEXT: affine.for -// CHECK affine.for - -// ----- -func @reduce_add_non_maximal_f32_f32(%arg0: memref<64x64xf32, 1>, %arg1 : memref<1x64xf32, 1>, %arg2 : memref<1x64xf32, 1>) { - %cst_0 = constant 0.000000e+00 : f32 - %cst_1 = constant 1.000000e+00 : f32 - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - %accum = affine.for %arg5 = 0 to 64 iter_args (%prevAccum = %cst_0) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> - %5 = addf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_dbl = addf %accum, %accum : f32 - affine.store %accum_dbl, %arg1[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - affine.for %arg3 = 0 to 1 { - affine.for %arg4 = 0 to 64 { - // Following loop trip count does not match the corresponding source trip count. - %accum = affine.for %arg5 = 0 to 32 iter_args (%prevAccum = %cst_1) -> f32 { - %4 = affine.load %arg0[%arg5, %arg4] : memref<64x64xf32, 1> - %5 = mulf %prevAccum, %4 : f32 - affine.yield %5 : f32 - } - %accum_sqr = mulf %accum, %accum : f32 - affine.store %accum_sqr, %arg2[%arg3, %arg4] : memref<1x64xf32, 1> - } - } - return -} -// Test checks the loop structure is preserved after sibling fusion -// since the destination loop and source loop trip counts do not -// match. 
-// MAXIMAL-LABEL: func @reduce_add_non_maximal_f32_f32( -// MAXIMAL: %[[cst_0:.*]] = constant 0.000000e+00 : f32 -// MAXIMAL-NEXT: %[[cst_1:.*]] = constant 1.000000e+00 : f32 -// MAXIMAL-NEXT: affine.for %[[idx_0:.*]]= 0 to 1 { -// MAXIMAL-NEXT: affine.for %[[idx_1:.*]] = 0 to 64 { -// MAXIMAL-NEXT: %[[result_1:.*]] = affine.for %[[idx_2:.*]] = 0 to 32 iter_args(%[[iter_0:.*]] = %[[cst_1]]) -> (f32) { -// MAXIMAL-NEXT: %[[result_0:.*]] = affine.for %[[idx_3:.*]] = 0 to 64 iter_args(%[[iter_1:.*]] = %[[cst_0]]) -> (f32) { - -// ----- - -// CHECK-LABEL: func @fuse_large_number_of_loops -func @fuse_large_number_of_loops(%arg0: memref<20x10xf32, 1>, %arg1: memref<20x10xf32, 1>, %arg2: memref<20x10xf32, 1>, %arg3: memref<20x10xf32, 1>, %arg4: memref<20x10xf32, 1>, %arg5: memref, %arg6: memref, %arg7: memref, %arg8: memref, %arg9: memref<20x10xf32, 1>, %arg10: memref<20x10xf32, 1>, %arg11: memref<20x10xf32, 1>, %arg12: memref<20x10xf32, 1>) { - %cst = constant 1.000000e+00 : f32 - %0 = memref.alloc() : memref - affine.store %cst, %0[] : memref - %1 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg6[] : memref - affine.store %21, %1[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %2 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %arg3[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %2[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %3 = memref.alloc() : memref - %4 = affine.load %arg6[] : memref - %5 = affine.load %0[] : memref - %6 = subf %5, %4 : f32 - affine.store %6, %3[] : memref - %7 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %3[] : memref - affine.store %21, %7[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %8 = memref.alloc() : 
memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %7[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %8[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %9 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg1[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %9[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %9[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %2[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = addf %22, %21 : f32 - affine.store %23, %arg11[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %10 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %1[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %arg2[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %10[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %8[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %10[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = addf %22, %21 : f32 - affine.store %23, %arg10[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %11 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %arg10[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %11[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %12 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 
= 0 to 10 { - %21 = affine.load %11[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %arg11[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = subf %22, %21 : f32 - affine.store %23, %12[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %13 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg7[] : memref - affine.store %21, %13[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %14 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg4[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %13[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %14[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %15 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg8[] : memref - affine.store %21, %15[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %16 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %15[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %12[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = addf %22, %21 : f32 - affine.store %23, %16[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %17 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %16[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = math.sqrt %21 : f32 - affine.store %22, %17[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %18 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg5[] : memref - affine.store %21, %18[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %19 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg1[%arg13, %arg14] : 
memref<20x10xf32, 1> - %22 = affine.load %18[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = mulf %22, %21 : f32 - affine.store %23, %19[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - %20 = memref.alloc() : memref<20x10xf32, 1> - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %17[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %19[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = divf %22, %21 : f32 - affine.store %23, %20[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %20[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %14[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = addf %22, %21 : f32 - affine.store %23, %arg12[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - affine.for %arg13 = 0 to 20 { - affine.for %arg14 = 0 to 10 { - %21 = affine.load %arg12[%arg13, %arg14] : memref<20x10xf32, 1> - %22 = affine.load %arg0[%arg13, %arg14] : memref<20x10xf32, 1> - %23 = subf %22, %21 : f32 - affine.store %23, %arg9[%arg13, %arg14] : memref<20x10xf32, 1> - } - } - return -} -// CHECK: affine.for -// CHECK: affine.for -// CHECK-NOT: affine.for - -// ----- - -// Expects fusion of producer into consumer at depth 4 and subsequent removal of -// source loop. 
-// CHECK-LABEL: func @unflatten4d -func @unflatten4d(%arg1: memref<7x8x9x10xf32>) { - %m = memref.alloc() : memref<5040xf32> - %cf7 = constant 7.0 : f32 - - affine.for %i0 = 0 to 7 { - affine.for %i1 = 0 to 8 { - affine.for %i2 = 0 to 9 { - affine.for %i3 = 0 to 10 { - affine.store %cf7, %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32> - } - } - } - } - affine.for %i0 = 0 to 7 { - affine.for %i1 = 0 to 8 { - affine.for %i2 = 0 to 9 { - affine.for %i3 = 0 to 10 { - %v0 = affine.load %m[720 * %i0 + 90 * %i1 + 10 * %i2 + %i3] : memref<5040xf32> - affine.store %v0, %arg1[%i0, %i1, %i2, %i3] : memref<7x8x9x10xf32> - } - } - } - } - return -} - -// CHECK: affine.for -// CHECK-NEXT: affine.for -// CHECK-NEXT: affine.for -// CHECK-NEXT: affine.for -// CHECK-NOT: affine.for -// CHECK: return - -// ----- - -// Expects fusion of producer into consumer at depth 2 and subsequent removal of -// source loop. -// CHECK-LABEL: func @unflatten2d_with_transpose -func @unflatten2d_with_transpose(%arg1: memref<8x7xf32>) { - %m = memref.alloc() : memref<56xf32> - %cf7 = constant 7.0 : f32 - - affine.for %i0 = 0 to 7 { - affine.for %i1 = 0 to 8 { - affine.store %cf7, %m[8 * %i0 + %i1] : memref<56xf32> - } - } - affine.for %i0 = 0 to 8 { - affine.for %i1 = 0 to 7 { - %v0 = affine.load %m[%i0 + 8 * %i1] : memref<56xf32> - affine.store %v0, %arg1[%i0, %i1] : memref<8x7xf32> - } - } - return -} - -// CHECK: affine.for -// CHECK-NEXT: affine.for -// CHECK-NOT: affine.for -// CHECK: return