diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -726,10 +726,10 @@ bool Insert, bool Extract) const; /// Estimate the overhead of scalarizing an instructions unique - /// non-constant operands. The types of the arguments are ordinarily - /// scalar, in which case the costs are multiplied with VF. + /// non-constant operands. The (potentially vector) types to use for each + /// argument are passed via Tys. unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) const; + ArrayRef Tys) const; /// If target has efficient vector element load/store instructions, it can /// return true here so that insertion/extraction costs are not added to @@ -1479,7 +1479,7 @@ bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) = 0; + ArrayRef Tys) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions @@ -1864,8 +1864,8 @@ return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) override { - return Impl.getOperandsScalarizationOverhead(Args, VF); + ArrayRef Tys) override { + return Impl.getOperandsScalarizationOverhead(Args, Tys); } bool supportsEfficientVectorElementLoadStore() override { diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -289,7 +289,7 @@ } unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) const { + ArrayRef Tys) const { return 0; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h 
b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -609,51 +609,48 @@ return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } - /// Estimate the overhead of scalarizing an instruction's unique - /// non-constant operands. The types of the arguments are ordinarily - /// scalar, in which case the costs are multiplied with VF. + /// Estimate the overhead of scalarizing an instruction's unique + /// non-constant operands. The (potentially vector) types to use for each + /// argument are passed via Tys. unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF) { + ArrayRef Tys) { + assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); + unsigned Cost = 0; SmallPtrSet UniqueOperands; - for (const Value *A : Args) { + for (int I = 0, E = Args.size(); I != E; I++) { // Disregard things like metadata arguments. - Type *Ty = A->getType(); + const Value *A = Args[I]; + Type *Ty = Tys[I]; if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && !Ty->isPtrOrPtrVectorTy()) continue; if (!isa(A) && UniqueOperands.insert(A).second) { - auto *VecTy = dyn_cast(Ty); - if (VecTy) { - // If A is a vector operand, VF should be 1 or correspond to A. - assert((VF == 1 || - VF == cast(VecTy)->getNumElements()) && - "Vector argument does not match VF"); - } - else - VecTy = FixedVectorType::get(Ty, VF); - - Cost += getScalarizationOverhead(VecTy, false, true); + if (auto *VecTy = dyn_cast(Ty)) + Cost += getScalarizationOverhead(VecTy, false, true); } } return Cost; } - unsigned getScalarizationOverhead(VectorType *InTy, - ArrayRef Args) { - auto *Ty = cast(InTy); - + /// Estimate the overhead of scalarizing the inputs and outputs of an + /// instruction, with return type RetTy and arguments Args of type Tys. If + /// Args are unknown (empty), then the cost associated with one argument is + /// added as a heuristic. 
+ unsigned getScalarizationOverhead(VectorType *RetTy, + ArrayRef Args, + ArrayRef Tys) { unsigned Cost = 0; - Cost += getScalarizationOverhead(Ty, true, false); + Cost += getScalarizationOverhead(RetTy, true, false); if (!Args.empty()) - Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements()); + Cost += getOperandsScalarizationOverhead(Args, Tys); else // When no information on arguments is provided, we add the cost // associated with one argument as a heuristic. - Cost += getScalarizationOverhead(Ty, false, true); + Cost += getScalarizationOverhead(RetTy, false, true); return Cost; } @@ -710,7 +707,8 @@ Opd1PropInfo, Opd2PropInfo, Args, CxtI); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(VTy, Args) + Num * Cost; + SmallVector Tys(Args.size(), Ty); + return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; } // We don't know anything about this scalar instruction. @@ -1354,7 +1352,7 @@ ScalarizationCost += getScalarizationOverhead(cast(RetTy), true, false); ScalarizationCost += - getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue()); + getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); } IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -469,8 +469,8 @@ } unsigned TargetTransformInfo::getOperandsScalarizationOverhead( - ArrayRef Args, unsigned VF) const { - return TTIImpl->getOperandsScalarizationOverhead(Args, VF); + ArrayRef Args, ArrayRef Tys) const { + return TTIImpl->getOperandsScalarizationOverhead(Args, Tys); } bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp --- 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -549,7 +549,8 @@ Opd1PropInfo, Opd2PropInfo, Args, CxtI); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(VTy, Args) + Num * Cost; + SmallVector Tys(Args.size(), Ty); + return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; } // We don't know anything about this scalar instruction. @@ -752,7 +753,8 @@ if (!RetTy->isVoidTy()) ScalarizationCost += getScalarizationOverhead(cast(RetTy), true, false); - ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF); + ScalarizationCost += + getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); } IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1363,7 +1363,8 @@ CostKind); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. 
- return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost; + SmallVector Tys(Args.size(), Ty); + return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; } return BaseCost; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -107,7 +107,7 @@ unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); unsigned getOperandsScalarizationOverhead(ArrayRef Args, - unsigned VF); + ArrayRef Tys); unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys, TTI::TargetCostKind CostKind); unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -118,9 +118,10 @@ return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } -unsigned HexagonTTIImpl::getOperandsScalarizationOverhead( - ArrayRef Args, unsigned VF) { - return BaseT::getOperandsScalarizationOverhead(Args, VF); +unsigned +HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef Args, + ArrayRef Tys) { + return BaseT::getOperandsScalarizationOverhead(Args, Tys); } unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -487,8 +487,10 @@ if (DivRemConstPow2) return (NumVectors * (SignedDivRem ? 
SDivPow2Cost : 1)); - if (DivRemConst) - return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args); + if (DivRemConst) { + SmallVector Tys(Args.size(), Ty); + return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys); + } if ((SignedDivRem || UnsignedDivRem) && VF > 4) // Temporary hack: disable high vectorization factors with integer // division/remainder, which will get scalarized and handled with @@ -511,7 +513,9 @@ // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); - unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args); + SmallVector Tys(Args.size(), Ty); + unsigned Cost = + (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys); // FIXME: VF 2 for these FP operations are currently just as // expensive as for VF 4. if (VF == 2) @@ -528,7 +532,9 @@ // There is no native support for FRem. if (Opcode == Instruction::FRem) { - unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args); + SmallVector Tys(Args.size(), Ty); + unsigned Cost = + (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys); // FIXME: VF 2 for float is currently just as expensive as for VF 4. 
if (VF == 2 && ScalarBits == 32) Cost *= 2; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -3634,15 +3634,15 @@ return Cost; } +static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) { + if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) + return Elt; + return VectorType::get(Elt, VF); +} + InstructionCost LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI, ElementCount VF) { - auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * { - if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy())) - return Elt; - return VectorType::get(Elt, VF); - }; - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); assert(ID && "Expected intrinsic call!"); Type *RetTy = MaybeVectorizeType(CI->getType(), VF); @@ -6914,8 +6914,11 @@ // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
+ SmallVector Tys; + for (auto *V : filterExtractingOperands(Ops, VF)) + Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), VF.getKnownMinValue()); + filterExtractingOperands(Ops, VF), Tys); } void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { diff --git a/llvm/test/Analysis/CostModel/PowerPC/matrix.ll b/llvm/test/Analysis/CostModel/PowerPC/matrix.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/PowerPC/matrix.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s +target datalayout = "E-m:e-i64:64-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +; This test checks we don't crash on certain matrix operations, more than +; checks the cost of the intrinsics per-se. + +define void @matrix() { +; CHECK-LABEL: 'matrix' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 452 for instruction: %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +entry: + %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1) + %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1) + ret void +} + +declare <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #2 +declare <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32>, <1 x i32>, i32 
immarg, i32 immarg, i32 immarg) #3 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll @@ -72,10 +72,11 @@ } define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) { +;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled. ; CHECK-LABEL: @vec_intrinsic ; CHECK: vector.body: ; CHECK: %[[LOAD:.*]] = load , * -; CHECK: call fast @sin_vec( %[[LOAD]]) +; CHECK: call fast @llvm.sin.nxv2f64( %[[LOAD]]) entry: %cmp7 = icmp sgt i64 %N, 0 br i1 %cmp7, label %for.body, label %for.end @@ -86,6 +87,7 @@ %0 = load double, double* %arrayidx, align 8 %1 = call fast double @llvm.sin.f64(double %0) #2 %add = fadd fast double %1, 1.000000e+00 + store double %add, double* %arrayidx, align 8 %iv.next = add nuw nsw i64 %iv, 1 %exitcond = icmp eq i64 %iv.next, %N br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1