Index: mlir/lib/Analysis/AffineAnalysis.cpp
===================================================================
--- mlir/lib/Analysis/AffineAnalysis.cpp
+++ mlir/lib/Analysis/AffineAnalysis.cpp
@@ -98,6 +98,111 @@
   return nullptr;
 }
 
+/// This function performs a similar task as getSupportedReduction, but
+/// specifically checks whether the given loop is a min/max reduction loop that
+/// can be vectorized.
+static Value getMinMaxReduction(AffineForOp forOp, unsigned pos,
+                                AtomicRMWKind &kind) {
+  auto yieldOp = cast<AffineYieldOp>(forOp.getBody()->back());
+  Value yielded = yieldOp.operands()[pos];
+
+  if (!yielded.hasOneUse())
+    return nullptr;
+
+  Operation *definition = yielded.getDefiningOp();
+  BlockArgument iterArg = forOp.getRegionIterArgs()[pos];
+
+  if (std::distance(iterArg.getUsers().begin(), iterArg.getUsers().end()) != 2)
+    return nullptr;
+
+  auto select = dyn_cast_or_null<SelectOp>(definition);
+
+  if (!select)
+    return nullptr;
+
+  Operation *cmpOp = select->getOperand(0).getDefiningOp();
+
+  if (!isa_and_nonnull<CmpFOp, CmpIOp>(cmpOp) || !cmpOp->hasOneUse() ||
+      !select->hasOneUse())
+    return nullptr;
+
+  // Check if compare and select both have one input that is iterArg.
+  if ((cmpOp->getOperand(0) != iterArg && cmpOp->getOperand(1) != iterArg) ||
+      (select->getOperand(1) != iterArg && select->getOperand(2) != iterArg))
+    return nullptr;
+
+  // Check that 'select' chooses between the same values as 'cmpOp' compares.
+  if ((cmpOp->getOperand(0) != select->getOperand(1) &&
+       cmpOp->getOperand(0) != select->getOperand(2)) ||
+      (cmpOp->getOperand(1) != select->getOperand(1) &&
+       cmpOp->getOperand(1) != select->getOperand(2)))
+    return nullptr;
+
+  // Check if the compare and the select have the same input order.
+  bool min_max_swapped =
+      (cmpOp->getOperand(0) != select->getOperand(1) ? true : false);
+
+  Optional<AtomicRMWKind> maybeKind =
+      TypeSwitch<Operation *, Optional<AtomicRMWKind>>(cmpOp)
+          .Case<CmpFOp>(
+              [min_max_swapped](CmpFOp cmpOp) -> Optional<AtomicRMWKind> {
+                switch (cmpOp.getPredicate()) {
+                case CmpFPredicate::OGT:
+                case CmpFPredicate::OGE:
+                case CmpFPredicate::UGT:
+                case CmpFPredicate::UGE:
+                  return (min_max_swapped ? AtomicRMWKind::minf
+                                          : AtomicRMWKind::maxf);
+                case CmpFPredicate::OLT:
+                case CmpFPredicate::OLE:
+                case CmpFPredicate::ULT:
+                case CmpFPredicate::ULE:
+                  return (min_max_swapped ? AtomicRMWKind::maxf
+                                          : AtomicRMWKind::minf);
+                default:
+                  return llvm::None;
+                }
+              })
+          .Case<CmpIOp>(
+              [min_max_swapped](CmpIOp cmpOp) -> Optional<AtomicRMWKind> {
+                switch (cmpOp.getPredicate()) {
+                case CmpIPredicate::sgt:
+                case CmpIPredicate::sge:
+                  return (min_max_swapped ? AtomicRMWKind::mins
+                                          : AtomicRMWKind::maxs);
+                case CmpIPredicate::ugt:
+                case CmpIPredicate::uge:
+                  return (min_max_swapped ? AtomicRMWKind::minu
+                                          : AtomicRMWKind::maxu);
+                case CmpIPredicate::slt:
+                case CmpIPredicate::sle:
+                  return (min_max_swapped ? AtomicRMWKind::maxs
+                                          : AtomicRMWKind::mins);
+                case CmpIPredicate::ult:
+                case CmpIPredicate::ule:
+                  return (min_max_swapped ? AtomicRMWKind::maxu
+                                          : AtomicRMWKind::minu);
+                default:
+                  return llvm::None;
+                }
+              })
+          .Default([](Operation *) -> Optional<AtomicRMWKind> {
+            return llvm::None;
+          });
+  if (!maybeKind)
+    return nullptr;
+
+  kind = *maybeKind;
+  if (definition->getOperand(1) == forOp.getRegionIterArgs()[pos] &&
+      !dependsOnIterArgs(definition->getOperand(2), forOp))
+    return definition->getOperand(2);
+  if (definition->getOperand(2) == forOp.getRegionIterArgs()[pos] &&
+      !dependsOnIterArgs(definition->getOperand(1), forOp))
+    return definition->getOperand(1);
+
+  return nullptr;
+}
+
 /// Returns true if `forOp' is a parallel loop. If `parallelReductions` is
 /// provided, populates it with descriptors of the parallelizable reductions and
 /// treats them as not preventing parallelization.
@@ -117,6 +222,8 @@
       AtomicRMWKind kind;
       if (Value value = getSupportedReduction(forOp, i, kind))
         parallelReductions->emplace_back(LoopReduction{kind, i, value});
+      else if (Value value = getMinMaxReduction(forOp, i, kind))
+        parallelReductions->emplace_back(LoopReduction{kind, i, value});
     }
 
   // Return later to allow for identifying all parallel reductions even if the
Index: mlir/lib/Dialect/StandardOps/IR/Ops.cpp
===================================================================
--- mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -359,9 +359,32 @@
 Attribute mlir::getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
                                      OpBuilder &builder, Location loc) {
   switch (kind) {
+  case AtomicRMWKind::maxf:
+    return builder.getFloatAttr(
+        resultType,
+        APFloat::getInf(resultType.cast<FloatType>().getFloatSemantics(),
+                        /*Negative=*/true));
   case AtomicRMWKind::addf:
   case AtomicRMWKind::addi:
+  case AtomicRMWKind::maxu:
     return builder.getZeroAttr(resultType);
+  case AtomicRMWKind::maxs:
+    return builder.getIntegerAttr(
+        resultType,
+        APInt::getSignedMinValue(resultType.cast<IntegerType>().getWidth()));
+  case AtomicRMWKind::minf:
+    return builder.getFloatAttr(
+        resultType,
+        APFloat::getInf(resultType.cast<FloatType>().getFloatSemantics(),
+                        /*Negative=*/false));
+  case AtomicRMWKind::mins:
+    return builder.getIntegerAttr(
+        resultType,
+        APInt::getSignedMaxValue(resultType.cast<IntegerType>().getWidth()));
+  case AtomicRMWKind::minu:
+    return builder.getIntegerAttr(
+        resultType,
+        APInt::getMaxValue(resultType.cast<IntegerType>().getWidth()));
   case AtomicRMWKind::muli:
     return builder.getIntegerAttr(resultType, 1);
   case AtomicRMWKind::mulf:
@@ -394,6 +417,30 @@
     return builder.create<MulFOp>(loc, lhs, rhs);
   case AtomicRMWKind::muli:
     return builder.create<MulIOp>(loc, lhs, rhs);
+  case AtomicRMWKind::maxf:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpFOp>(loc, CmpFPredicate::OGT, lhs, rhs), lhs,
+        rhs);
+  case AtomicRMWKind::minf:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpFOp>(loc, CmpFPredicate::OLT, lhs, rhs), lhs,
+        rhs);
+  case AtomicRMWKind::maxs:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpIOp>(loc, CmpIPredicate::sgt, lhs, rhs), lhs,
+        rhs);
+  case AtomicRMWKind::mins:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpIOp>(loc, CmpIPredicate::slt, lhs, rhs), lhs,
+        rhs);
+  case AtomicRMWKind::maxu:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpIOp>(loc, CmpIPredicate::ugt, lhs, rhs), lhs,
+        rhs);
+  case AtomicRMWKind::minu:
+    return builder.create<SelectOp>(
+        loc, builder.create<CmpIOp>(loc, CmpIPredicate::ult, lhs, rhs), lhs,
+        rhs);
   // TODO: Add remaining reduction operations.
   default:
     (void)emitOptionalError(loc, "Reduction operation type not supported");
Index: mlir/lib/Dialect/Vector/VectorOps.cpp
===================================================================
--- mlir/lib/Dialect/Vector/VectorOps.cpp
+++ mlir/lib/Dialect/Vector/VectorOps.cpp
@@ -357,6 +357,18 @@
     return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
                                                builder.getStringAttr("mul"),
                                                vector, ValueRange{});
+  case AtomicRMWKind::minf:
+  case AtomicRMWKind::mins:
+  case AtomicRMWKind::minu:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("min"),
+                                               vector, ValueRange{});
+  case AtomicRMWKind::maxf:
+  case AtomicRMWKind::maxs:
+  case AtomicRMWKind::maxu:
+    return builder.create<vector::ReductionOp>(vector.getLoc(), scalarType,
+                                               builder.getStringAttr("max"),
+                                               vector, ValueRange{});
   // TODO: Add remaining reduction operations.
default: (void)emitOptionalError(loc, "Reduction operation type not supported"); Index: mlir/test/Conversion/AffineToStandard/lower-affine.mlir =================================================================== --- mlir/test/Conversion/AffineToStandard/lower-affine.mlir +++ mlir/test/Conversion/AffineToStandard/lower-affine.mlir @@ -904,3 +904,71 @@ // CHECK: } // CHECK: scf.yield // CHECK: } + +///////////////////////////////////////////////////////////////////// + +func @affine_parallel_with_min_max_reductions(%arg0: memref<3x3xf32>, %arg1: memref<3x3xi8>) -> (f32, f32, i8, i8, i8, i8) { + %0:6 = affine.parallel (%kx, %ky) = (0, 0) to (2, 2) reduce ("minf", "maxf", "mins", "maxs", "minu", "maxu") -> (f32, f32, i8, i8, i8, i8) { + %3 = affine.load %arg0[%kx, %ky] : memref<3x3xf32> + %4 = affine.load %arg1[%kx, %ky] : memref<3x3xi8> + affine.yield %3, %3, %4, %4, %4, %4 : f32, f32, i8, i8, i8, i8 + } + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : f32, f32, i8, i8, i8, i8 +} + +// CHECK-LABEL: func @affine_parallel_with_min_max_reductions +// CHECK: %[[LOWER_1:.*]] = constant 0 : index +// CHECK-NEXT: %[[UPPER_1:.*]] = constant 2 : index +// CHECK-NEXT: %[[LOWER_2:.*]] = constant 0 : index +// CHECK-NEXT: %[[UPPER_2:.*]] = constant 2 : index +// CHECK-NEXT: %[[STEP_1:.*]] = constant 1 : index +// CHECK-NEXT: %[[STEP_2:.*]] = constant 1 : index +// CHECK-NEXT: %[[INIT_1:.*]] = constant 0x7F800000 : f32 +// CHECK-NEXT: %[[INIT_2:.*]] = constant 0xFF800000 : f32 +// CHECK-NEXT: %[[INIT_3:.*]] = constant 127 : i8 +// CHECK-NEXT: %[[INIT_4:.*]] = constant -128 : i8 +// CHECK-NEXT: %[[INIT_5:.*]] = constant -1 : i8 +// CHECK-NEXT: %[[INIT_6:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[RES:.*]] = scf.parallel (%[[I:.*]], %[[J:.*]]) = (%[[LOWER_1]], %[[LOWER_2]]) to (%[[UPPER_1]], %[[UPPER_2]]) step (%[[STEP_1]], %[[STEP_2]]) init (%[[INIT_1]], %[[INIT_2]], %[[INIT_3]], %[[INIT_4]], %[[INIT_5]], %[[INIT_6]]) -> (f32, f32, i8, i8, i8, i8) { +// CHECK-NEXT: %[[VAL_1:.*]] = 
memref.load +// CHECK-NEXT: %[[VAL_2:.*]] = memref.load +// CHECK-NEXT: scf.reduce(%[[VAL_1]]) : f32 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: f32, %[[RHS:.*]]: f32): +// CHECK-NEXT: %[[CMP:.*]] = cmpf olt, %[[LHS]], %[[RHS]] : f32 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : f32 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : f32 +// CHECK-NEXT: } +// CHECK-NEXT: scf.reduce(%[[VAL_1]]) : f32 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: f32, %[[RHS:.*]]: f32): +// CHECK-NEXT: %[[CMP:.*]] = cmpf ogt, %[[LHS]], %[[RHS]] : f32 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : f32 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : f32 +// CHECK-NEXT: } +// CHECK-NEXT: scf.reduce(%[[VAL_2]]) : i8 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: i8, %[[RHS:.*]]: i8): +// CHECK-NEXT: %[[CMP:.*]] = cmpi slt, %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : i8 +// CHECK-NEXT: } +// CHECK-NEXT: scf.reduce(%[[VAL_2]]) : i8 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: i8, %[[RHS:.*]]: i8): +// CHECK-NEXT: %[[CMP:.*]] = cmpi sgt, %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : i8 +// CHECK-NEXT: } +// CHECK-NEXT: scf.reduce(%[[VAL_2]]) : i8 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: i8, %[[RHS:.*]]: i8): +// CHECK-NEXT: %[[CMP:.*]] = cmpi ult, %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : i8 +// CHECK-NEXT: } +// CHECK-NEXT: scf.reduce(%[[VAL_2]]) : i8 { +// CHECK-NEXT: ^bb0(%[[LHS:.*]]: i8, %[[RHS:.*]]: i8): +// CHECK-NEXT: %[[CMP:.*]] = cmpi ugt, %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: %[[SEL:.*]] = select %[[CMP]], %[[LHS]], %[[RHS]] : i8 +// CHECK-NEXT: scf.reduce.return %[[SEL]] : i8 +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } Index: 
mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir =================================================================== --- mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -466,3 +466,557 @@ // CHECK: %[[new_eacc:.*]] = select %[[mask]], %[[eadd]], %[[esum_iter]] : vector<128xi1>, vector<128xf32> // CHECK: affine.yield %[[new_acc]], %[[new_eacc]] : vector<128xf32> // CHECK: } + +// ----- + +// CHECK-LABEL: func @reduce_max_f32 +func @reduce_max_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[cst_1:.*]] = constant dense<0xFF800000> : vector<128xf32> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst_1:.*]]) -> (vector<128xf32>) +// CHECK-NEXT: %[[cst_1:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[cst_1:.*]] : memref<1024xf32, 1>, vector<128xf32> +// CHECK-NEXT: %[[V5:.*]] = cmpf ogt, %[[V4:.*]], %[[iter:.*]] : vector<128xf32> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xf32> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xf32> into f32 +// CHECK-NEXT: %[[V2:.*]] = cmpf ogt, %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xf32, 1> + +// ----- + +// CHECK-LABEL: func 
@reduce_max_swap_f32 +func @reduce_max_swap_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %iter, %t : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[cst_1:.*]] = constant dense<0x7F800000> : vector<128xf32> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst_1:.*]]) -> (vector<128xf32>) +// CHECK-NEXT: %[[cst_1:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[cst_1:.*]] : memref<1024xf32, 1>, vector<128xf32> +// CHECK-NEXT: %[[V5:.*]] = cmpf ogt, %[[V4:.*]], %[[iter:.*]] : vector<128xf32> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xf32> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] : vector<128xf32> into f32 +// CHECK-NEXT: %[[V2:.*]] = cmpf olt, %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xf32, 1> + +// ----- + +// CHECK-LABEL: func @reduce_min_f32 +func @reduce_min_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "olt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[cst_1:.*]] = constant dense<0x7F800000> : 
vector<128xf32> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst_1:.*]]) -> (vector<128xf32>) +// CHECK-NEXT: %[[cst_1:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[cst_1:.*]] : memref<1024xf32, 1>, vector<128xf32> +// CHECK-NEXT: %[[V5:.*]] = cmpf olt, %[[V4:.*]], %[[iter:.*]] : vector<128xf32> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xf32> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] : vector<128xf32> into f32 +// CHECK-NEXT: %[[V2:.*]] = cmpf olt, %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xf32, 1> + +// ----- + +// CHECK-LABEL: func @reduce_min_swap_f32 +func @reduce_min_swap_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "olt", %t, %iter : f32 + %s = select %c, %iter, %t : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK: %[[cst_0:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[cst_1:.*]] = constant dense<0xFF800000> : vector<128xf32> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst_1:.*]]) -> (vector<128xf32>) +// CHECK-NEXT: %[[cst_1:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[cst_1:.*]] : memref<1024xf32, 1>, vector<128xf32> +// CHECK-NEXT: %[[V5:.*]] = cmpf olt, %[[V4:.*]], %[[iter:.*]] : vector<128xf32> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xf32> +// 
CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xf32> into f32 +// CHECK-NEXT: %[[V2:.*]] = cmpf ogt, %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[cst_0:.*]] : f32 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xf32, 1> + +// ----- + +// CHECK-LABEL: func @reduce_max_signed_i8 +func @reduce_max_signed_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "sgt", %t, %iter : i8 + %s = select %c, %t, %iter : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<-128> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi sgt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi sgt, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_max_swap_signed_i8 +func @reduce_max_swap_signed_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for 
%it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "sgt", %t, %iter : i8 + %s = select %c, %iter, %t : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<127> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi sgt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi slt, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_max_unsigned_i8 +func @reduce_max_unsigned_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "ugt", %t, %iter : i8 + %s = select %c, %t, %iter : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<0> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] 
= vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi ugt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: affine.store %[[V1:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_max_swap_unsigned_i8 +func @reduce_max_swap_unsigned_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "ugt", %t, %iter : i8 + %s = select %c, %iter, %t : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<-1> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi ugt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi ult, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func 
@reduce_min_signed_i8 +func @reduce_min_signed_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "slt", %t, %iter : i8 + %s = select %c, %t, %iter : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<127> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi slt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi slt, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_min_swap_signed_i8 +func @reduce_min_swap_signed_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "slt", %t, %iter : i8 + %s = select %c, %iter, %t : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<-128> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 
to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi slt, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi sgt, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_min_unsigned_i8 +func @reduce_min_unsigned_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "ult", %t, %iter : i8 + %s = select %c, %t, %iter : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<-1> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi ult, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[iter:.*]], %[[V4:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "min", %[[V0:.*]] 
: vector<128xi8> into i8 +// CHECK-NEXT: %[[V2:.*]] = cmpi ult, %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[c0_i8:.*]] : i8 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_min_swap_unsigned_i8 +func @reduce_min_swap_unsigned_i8(%in: memref<1024xi8, 1>, %out: memref<1xi8, 1>) { + %cst = constant 0 : i8 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (i8) { + %t = affine.load %in[%it] : memref<1024xi8, 1> + %c = cmpi "ult", %t, %iter : i8 + %s = select %c, %iter, %t : i8 + affine.yield %s : i8 + } + affine.store %0, %out[0] : memref<1xi8, 1> + return +} + +// CHECK: %[[c0_i8:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[cst:.*]] = constant dense<0> : vector<128xi8> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 1024 step 128 iter_args(%[[iter:.*]] = %[[cst:.*]]) -> (vector<128xi8>) +// CHECK-NEXT: %[[c0_i8_0:.*]] = constant 0 : i8 +// CHECK-NEXT: %[[V4:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[c0_i8_0:.*]] : memref<1024xi8, 1>, vector<128xi8> +// CHECK-NEXT: %[[V5:.*]] = cmpi ult, %[[V4:.*]], %[[iter:.*]] : vector<128xi8> +// CHECK-NEXT: %[[V6:.*]] = select %[[V5:.*]], %[[V4:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xi8> +// CHECK-NEXT: affine.yield %[[V6:.*]] : vector<128xi8> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xi8> into i8 +// CHECK-NEXT: affine.store %[[V1:.*]], %{{.*}}[0] : memref<1xi8, 1> + +// ----- + +// CHECK-LABEL: func @reduce_max_masked_f32 +func @reduce_max_masked_f32(%in: memref<501xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 501 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<501xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK: %[[cst:.*]] 
= constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[cst_0:.*]] = constant dense<0xFF800000> : vector<128xf32> +// CHECK-NEXT: %[[V0:.*]] = affine.for %[[it:.*]] = 0 to 501 step 128 iter_args(%[[iter:.*]] = %[[cst_0:.*]]) -> (vector<128xf32>) +// CHECK-NEXT: %[[V4:.*]] = affine.apply #map(%{{.*}}) +// CHECK-NEXT: %[[V5:.*]] = vector.create_mask %[[V4:.*]] : vector<128xi1> +// CHECK-NEXT: %[[cst_1:.*]] = constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[V6:.*]] = vector.transfer_read %{{.*}}[%[[it:.*]]], %[[cst_1:.*]] : memref<501xf32, 1>, vector<128xf32> +// CHECK-NEXT: %[[V7:.*]] = cmpf ogt, %[[V6:.*]], %[[iter:.*]] : vector<128xf32> +// CHECK-NEXT: %[[V8:.*]] = select %[[V7:.*]], %[[V6:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xf32> +// CHECK-NEXT: %[[V9:.*]] = select %[[V5:.*]], %[[V8:.*]], %[[iter:.*]] : vector<128xi1>, vector<128xf32> +// CHECK-NEXT: affine.yield %[[V9:.*]] : vector<128xf32> +// CHECK-NEXT: } +// CHECK-NEXT: %[[V1:.*]] = vector.reduction "max", %[[V0:.*]] : vector<128xf32> into f32 +// CHECK-NEXT: %[[V2:.*]] = cmpf ogt, %[[V1:.*]], %[[cst:.*]] : f32 +// CHECK-NEXT: %[[V3:.*]] = select %[[V2:.*]], %[[V1:.*]], %[[cst:.*]] : f32 +// CHECK-NEXT: affine.store %[[V3:.*]], %{{.*}}[0] : memref<1xf32, 1> + +// ----- + +// CHECK-LABEL: func @reduce_oeq_f32 +func @reduce_oeq_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "oeq", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_wrong_select_1_f32 +func @reduce_wrong_select_1_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : 
memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %t : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_wrong_select_2_f32 +func @reduce_wrong_select_2_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c1 = cmpf "ogt", %t, %iter : f32 + %c2 = cmpf "ogt", %t, %iter : f32 + %s1 = select %c1, %t, %t : f32 + %s2 = select %c2, %t, %t : f32 + affine.yield %s1 : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_wrong_cmpf_1_f32 +func @reduce_wrong_cmpf_1_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %t : f32 + %s = select %c, %t, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_wrong_cmpf_2_f32 +func @reduce_wrong_cmpf_2_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c1 = cmpf "ogt", %t, %t : f32 + %c2 = cmpf "ogt", %t, %t : f32 + %s1 = select %c1, %t, %iter : f32 + %s2 = select %c2, %t, %iter : f32 + affine.yield %s1 : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_inconsistent_cmpf_select_1_f32 +func @reduce_inconsistent_cmpf_select_1_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 
0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t1 = affine.load %in[%it] : memref<1024xf32, 1> + %t2 = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t1, %iter : f32 + %s = select %c, %t2, %iter : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_inconsistent_cmpf_select_2_f32 +func @reduce_inconsistent_cmpf_select_2_f32(%in: memref<1024xf32, 1>, %out: memref<1xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t1 = affine.load %in[%it] : memref<1024xf32, 1> + %t2 = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %iter, %t1 : f32 + %s = select %c, %iter, %t2 : f32 + affine.yield %s : f32 + } + affine.store %0, %out[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_leaked_1 +func @reduce_leaked_1(%in: memref<1024xf32, 1>, %out1: memref<1xf32, 1>, %out2: memref<1024xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.store %s, %out2[%it] : memref<1024xf32, 1> + affine.yield %s : f32 + } + affine.store %0, %out1[0] : memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_leaked_2 +func @reduce_leaked_2(%in: memref<1024xf32, 1>, %out1: memref<1xf32, 1>, %out2: memref<1024xf32, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.store %iter, %out2[%it] : memref<1024xf32, 1> + affine.yield %s : f32 + } + affine.store %0, %out1[0] : 
memref<1xf32, 1> + return +} + +// CHECK-NOT: vector.reduction + +// ----- + +// CHECK-LABEL: func @reduce_leaked_3 +func @reduce_leaked_3(%in: memref<1024xf32, 1>, %out1: memref<1xf32, 1>, %out2: memref<1024xi1, 1>) { + %cst = constant 0.0 : f32 + %0 = affine.for %it = 0 to 1024 step 1 iter_args(%iter = %cst) -> (f32) { + %t = affine.load %in[%it] : memref<1024xf32, 1> + %c = cmpf "ogt", %t, %iter : f32 + %s = select %c, %t, %iter : f32 + affine.store %c, %out2[%it] : memref<1024xi1, 1> + affine.yield %s : f32 + } + affine.store %0, %out1[0] : memref<1xf32, 1> + return +} +