diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1341,8 +1341,8 @@
   if (!IsIntVec && !FMF.allowReassoc())
     return;
 
-  auto CanBeFlattened = [this](Value *Op) {
-    if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end())
+  auto CanBeFlattened = [](Value *Op) {
+    if (match(Op, m_BinOp()))
       return true;
     return match(
         Op, m_OneUse(m_CombineOr(
@@ -1355,6 +1355,9 @@
   // the returned cost is < 0, the argument is cheaper to use in the
   // dot-product lowering.
   auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) {
+    if (ShapeMap.find(Op) == ShapeMap.end())
+      return InstructionCost::getInvalid();
+
     if (!isa<Instruction>(Op))
       return InstructionCost(0);
 
@@ -1365,7 +1368,7 @@
       InstructionCost EmbedCost(0);
       // Roughly estimate the cost for embedding the columns into a vector.
       for (unsigned I = 1; I < N; ++I)
-        EmbedCost -=
+        EmbedCost +=
             TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
                                std::nullopt, TTI::TCK_RecipThroughput);
       return EmbedCost;
@@ -1387,7 +1390,7 @@
       // vector.
       InstructionCost EmbedCost(0);
       for (unsigned I = 1; I < N; ++I)
-        EmbedCost +=
+        EmbedCost -=
             TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1),
                                std::nullopt, TTI::TCK_RecipThroughput);
       return EmbedCost;
@@ -1400,7 +1403,26 @@
     return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) -
            N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0);
   };
-  auto LHSCost = GetCostForArg(LHS, LShape.NumColumns);
+
+  SmallPtrSet<Value *, 4> Seen;
+  SmallVector<Value *> WorkList;
+  SmallVector<Value *> ToFlatten;
+  WorkList.push_back(LHS);
+  InstructionCost LHSCost(0);
+  while (!WorkList.empty()) {
+    Value *Op = WorkList.pop_back_val();
+    if (!Seen.insert(Op).second)
+      continue;
+
+    InstructionCost OpCost = GetCostForArg(Op, LShape.NumColumns);
+    if (OpCost + LHSCost >= LHSCost)
+      continue;
+
+    LHSCost += OpCost;
+    ToFlatten.push_back(Op);
+    if (auto *I = dyn_cast<Instruction>(Op))
+      WorkList.append(I->op_begin(), I->op_end());
+  }
 
   // We compare the costs of a vector.reduce.add to sequential add.
   int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd;
@@ -1421,16 +1443,16 @@
   FusedInsts.insert(MatMul);
   IRBuilder<> Builder(MatMul);
   auto FlattenArg = [&Builder, &FusedInsts, &CanBeFlattened,
-                     this](Value *Op) -> Value * {
+                     this](Value *Op) {
     // Matmul must be the only user of loads because we don't use LowerLoad
     // for row vectors (LowerLoad results in scalar loads and shufflevectors
     // instead of single vector load).
     if (!CanBeFlattened(Op))
-      return Op;
+      return;
 
     if (match(Op, m_BinOp()) && ShapeMap.find(Op) != ShapeMap.end()) {
       ShapeMap[Op] = ShapeMap[Op].t();
-      return Op;
+      return;
     }
 
     FusedInsts.insert(cast<Instruction>(Op));
@@ -1441,16 +1463,19 @@
       auto *NewLoad = Builder.CreateLoad(Op->getType(), Arg);
       Op->replaceAllUsesWith(NewLoad);
       cast<Instruction>(Op)->eraseFromParent();
-      return NewLoad;
+      return;
     } else if (match(Op, m_Intrinsic<Intrinsic::matrix_transpose>(
                          m_Value(Arg)))) {
       ToRemove.push_back(cast<Instruction>(Op));
-      return Arg;
+      Op->replaceAllUsesWith(Arg);
+      return;
     }
-
-    return Op;
   };
-  LHS = FlattenArg(LHS);
+
+  for (auto *V : ToFlatten)
+    FlattenArg(V);
+
+  LHS = MatMul->getArgOperand(0);
 
   // Insert mul/fmul and llvm.vector.reduce.fadd
   Value *Mul =
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-int.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-int.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-int.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-int.ll
@@ -119,44 +119,15 @@
 define <1 x i32> @add_chain_feeding_dotproduct_i32_v8_1(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
 ; CHECK-LABEL: @add_chain_feeding_dotproduct_i32_v8_1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 3>
-; CHECK-NEXT:    [[SPLIT4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 4>
-; CHECK-NEXT:    [[SPLIT5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 5>
-; CHECK-NEXT:    [[SPLIT6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 6>
-; CHECK-NEXT:    [[SPLIT7:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <1 x i32> <i32 7>
-; CHECK-NEXT:    [[SPLIT8:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <1 x i32> zeroinitializer
-; CHECK-NEXT:    [[SPLIT9:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 1>
-; CHECK-NEXT:    [[SPLIT10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 2>
-; CHECK-NEXT:    [[SPLIT11:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 3>
-; CHECK-NEXT:    [[SPLIT12:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 4>
-; CHECK-NEXT:    [[SPLIT13:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 5>
-; CHECK-NEXT:    [[SPLIT14:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 6>
-; CHECK-NEXT:    [[SPLIT15:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <1 x i32> <i32 7>
-; CHECK-NEXT:    [[TMP0:%.*]] = add <1 x i32> [[SPLIT]], [[SPLIT8]]
-; CHECK-NEXT:    [[TMP1:%.*]] = add <1 x i32> [[SPLIT1]], [[SPLIT9]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add <1 x i32> [[SPLIT2]], [[SPLIT10]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add <1 x i32> [[SPLIT3]], [[SPLIT11]]
-; CHECK-NEXT:    [[TMP4:%.*]] = add <1 x i32> [[SPLIT4]], [[SPLIT12]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <1 x i32> [[SPLIT5]], [[SPLIT13]]
-; CHECK-NEXT:    [[TMP6:%.*]] = add <1 x i32> [[SPLIT6]], [[SPLIT14]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <1 x i32> [[SPLIT7]], [[SPLIT15]]
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <1 x i32> [[TMP0]], <1 x i32> [[TMP1]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <1 x i32> [[TMP2]], <1 x i32> [[TMP3]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <1 x i32> [[TMP4]], <1 x i32> [[TMP5]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <1 x i32> [[TMP6]], <1 x i32> [[TMP7]], <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[SPLIT16:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[SPLIT17:%.*]] = shufflevector <8 x i32> [[C:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP15:%.*]] = add <8 x i32> [[SPLIT16]], [[SPLIT17]]
-; CHECK-NEXT:    [[TMP16:%.*]] = mul <8 x i32> [[TMP15]], [[D:%.*]]
-; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP16]])
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <1 x i32> poison, i32 [[TMP17]], i64 0
-; CHECK-NEXT:    ret <1 x i32> [[TMP18]]
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add <8 x i32> [[SPLIT]], [[SPLIT1]]
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <8 x i32> [[C:%.*]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[TMP0]], [[SPLIT2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[D:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[TMP3]], i64 0
+; CHECK-NEXT:    ret <1 x i32> [[TMP4]]
 ;
 entry:
   %add.1 = add <8 x i32> %a, %b