Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -726,7 +726,7 @@
   /// non-constant operands. The types of the arguments are ordinarily
   /// scalar, in which case the costs are multiplied with VF.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const;
+                                            ArrayRef<Type *> Tys) const;
 
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
@@ -1476,7 +1476,7 @@
                                             bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                                    unsigned VF) = 0;
+                                                    ArrayRef<Type *> Tys) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual MemCmpExpansionOptions
@@ -1859,8 +1859,8 @@
     return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
   }
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) override {
-    return Impl.getOperandsScalarizationOverhead(Args, VF);
+                                            ArrayRef<Type *> Tys) override {
+    return Impl.getOperandsScalarizationOverhead(Args, Tys);
   }
   bool supportsEfficientVectorElementLoadStore() override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -288,7 +288,7 @@
   }
 
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const {
+                                            ArrayRef<Type *> Tys) const {
     return 0;
   }
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -613,28 +613,22 @@
   /// non-constant operands. The types of the arguments are ordinarily
   /// scalar, in which case the costs are multiplied with VF.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) {
+                                            ArrayRef<Type *> Tys) {
+    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
+
     unsigned Cost = 0;
     SmallPtrSet<const Value *, 4> UniqueOperands;
-    for (const Value *A : Args) {
+    for (int I = 0, E = Args.size(); I != E; I++) {
       // Disregard things like metadata arguments.
-      Type *Ty = A->getType();
+      const Value *A = Args[I];
+      Type *Ty = Tys[I];
       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
           !Ty->isPtrOrPtrVectorTy())
         continue;
 
       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
-        auto *VecTy = dyn_cast<VectorType>(Ty);
-        if (VecTy) {
-          // If A is a vector operand, VF should be 1 or correspond to A.
-          assert((VF == 1 ||
-                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
-                 "Vector argument does not match VF");
-        }
-        else
-          VecTy = FixedVectorType::get(Ty, VF);
-
-        Cost += getScalarizationOverhead(VecTy, false, true);
+        if (auto *VecTy = dyn_cast<VectorType>(Ty))
+          Cost += getScalarizationOverhead(VecTy, false, true);
       }
     }
@@ -648,9 +642,12 @@
     unsigned Cost = 0;
 
     Cost += getScalarizationOverhead(Ty, true, false);
-    if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
-    else
+    if (!Args.empty()) {
+      // Assume all arguments are of type Ty, once vectorized. This function is
+      // called from getArithmeticInstrCost, so that is a reasonable heuristic.
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      Cost += getOperandsScalarizationOverhead(Args, Tys);
+    } else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
       Cost += getScalarizationOverhead(Ty, false, true);
@@ -1349,7 +1346,7 @@
       ScalarizationCost +=
           getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
     }
 
     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -471,8 +471,8 @@
 }
 
 unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, unsigned VF) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
 }
 
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -748,7 +748,8 @@
     if (!RetTy->isVoidTy())
       ScalarizationCost +=
           getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
-    ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+    ScalarizationCost +=
+        getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
   }
 
   IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -106,7 +106,7 @@
   unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
                                     bool Insert, bool Extract);
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF);
+                                            ArrayRef<Type *> Tys);
   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                             TTI::TargetCostKind CostKind);
   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -114,9 +114,10 @@
   return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
 }
 
-unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, unsigned VF) {
-  return BaseT::getOperandsScalarizationOverhead(Args, VF);
+unsigned
+HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                                 ArrayRef<Type *> Tys) {
+  return BaseT::getOperandsScalarizationOverhead(Args, Tys);
 }
 
 unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3796,15 +3796,15 @@
   return Cost;
 }
 
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+    return Elt;
+  return VectorType::get(Elt, VF);
+}
+
 InstructionCost
 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                    ElementCount VF) {
-  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
-    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-      return Elt;
-    return VectorType::get(Elt, VF);
-  };
-
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
@@ -3815,7 +3815,8 @@
   SmallVector<Value *, 4> Arguments(CI->arg_begin(), CI->arg_end());
   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
   SmallVector<Type *> ParamTys;
-  std::transform(FTy->param_begin(), FTy->param_end(), ParamTys.begin(),
+  std::transform(FTy->param_begin(), FTy->param_end(),
+                 std::back_inserter(ParamTys),
                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
 
   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
@@ -7073,8 +7074,11 @@
 
   // Skip operands that do not require extraction/scalarization and do not incur
   // any overhead.
+  SmallVector<Type *> Tys;
+  for (auto *V : filterExtractingOperands(Ops, VF))
+    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
   return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
+                    filterExtractingOperands(Ops, VF), Tys);
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
Index: llvm/test/Analysis/CostModel/PowerPC/matrix.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/PowerPC/matrix.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that we don't crash on certain matrix operations, more than
+; it checks the cost of the intrinsics per se.
+
+define void @matrix() {
+; CHECK-LABEL: 'matrix'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 452 for instruction: %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+  %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+  ret void
+}
+
+declare <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #2
+declare <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32>, <1 x i32>, i32 immarg, i32 immarg, i32 immarg) #3
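
Note for reviewers, not part of the patch: below is a minimal caller-side sketch of the new contract. It assumes a caller that knows its own VF and wants per-operand types derived the same way the LoopVectorize change above does. The helper name costOperandScalarization is hypothetical; the widening policy simply mirrors the MaybeVectorizeType helper this patch adds to LoopVectorize.cpp.

// Hypothetical caller (illustration only): build Tys in lockstep with Args,
// as the new assert in BasicTTIImplBase::getOperandsScalarizationOverhead
// requires, widening only scalar int/pointer/FP operand types.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Value.h"
using namespace llvm;

static unsigned costOperandScalarization(const TargetTransformInfo &TTI,
                                         ArrayRef<const Value *> Args,
                                         ElementCount VF) {
  SmallVector<Type *> Tys;
  for (const Value *Op : Args) {
    Type *Ty = Op->getType();
    // Same policy as MaybeVectorizeType: for a scalar VF, and for types that
    // cannot be widened (vectors, metadata, ...), keep the original type.
    if (!VF.isScalar() && (Ty->isIntOrPtrTy() || Ty->isFloatingPointTy()))
      Ty = VectorType::get(Ty, VF);
    Tys.push_back(Ty);
  }
  // Args.size() == Tys.size() holds by construction.
  return TTI.getOperandsScalarizationOverhead(Args, Tys);
}

This also illustrates why the interface change matters for the new PowerPC test: the <10 x i32> and <1 x i32> operands of llvm.matrix.multiply carry different element counts, so no single VF can describe both, and the old code could trip the "Vector argument does not match VF" assertion. Whether a shared helper like this should live next to the TTI hook, rather than each caller rolling its own (MaybeVectorizeType stays file-static in LoopVectorize.cpp), may be worth a follow-up.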