diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1169,6 +1169,7 @@
   getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                   unsigned AddressSpace,
                   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
+                  OperandValueKind OpdInfo = OK_AnyValue,
                   const Instruction *I = nullptr) const;
 
   /// \return The cost of VP Load and Store instructions.
@@ -1725,11 +1726,10 @@
                            const APInt &DemandedDstElts,
                            TTI::TargetCostKind CostKind) = 0;
-  virtual InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                          Align Alignment,
-                                          unsigned AddressSpace,
-                                          TTI::TargetCostKind CostKind,
-                                          const Instruction *I) = 0;
+  virtual InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  OperandValueKind OpdInfo, const Instruction *I) = 0;
   virtual InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                             Align Alignment,
                                             unsigned AddressSpace,
@@ -2275,9 +2275,10 @@
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                   unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
+                                  OperandValueKind OpdInfo,
                                   const Instruction *I) override {
-    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                CostKind, I);
+    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpdInfo, I);
   }
   InstructionCost getVPMemoryOpCost(unsigned Opcode, Type *Src,
                                     Align Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -580,6 +580,7 @@
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
                                   unsigned AddressSpace,
                                   TTI::TargetCostKind CostKind,
+                                  TTI::OperandValueKind OpdInfo,
                                   const Instruction *I) const {
     return 1;
   }
@@ -1074,9 +1075,11 @@
     case Instruction::Store: {
       auto *SI = cast<StoreInst>(U);
       Type *ValTy = U->getOperand(0)->getType();
+      TTI::OperandValueProperties OpVP = TTI::OP_None;
+      TTI::OperandValueKind OpVK = TTI::getOperandInfo(U->getOperand(0), OpVP);
       return TargetTTI->getMemoryOpCost(Opcode, ValTy, SI->getAlign(),
-                                        SI->getPointerAddressSpace(),
-                                        CostKind, I);
+                                        SI->getPointerAddressSpace(), CostKind,
+                                        OpVK, I);
     }
     case Instruction::Load: {
       auto *LI = cast<LoadInst>(U);
@@ -1094,8 +1097,8 @@
         LoadType = TI->getDestTy();
       }
       return TargetTTI->getMemoryOpCost(Opcode, LoadType, LI->getAlign(),
-                                        LI->getPointerAddressSpace(),
-                                        CostKind, I);
+                                        LI->getPointerAddressSpace(), CostKind,
+                                        TTI::OK_AnyValue, I);
     }
     case Instruction::Select: {
       const Value *Op0, *Op1;
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1187,10 +1187,11 @@
     return Cost;
   }
 
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr) {
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr) {
     assert(!Src->isVoidTy() && "Invalid type");
     // Assume types, such as structs, are expensive.
     if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -875,11 +875,12 @@
 InstructionCost TargetTransformInfo::getMemoryOpCost(
     unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
-    TTI::TargetCostKind CostKind, const Instruction *I) const {
+    TTI::TargetCostKind CostKind, TTI::OperandValueKind OpdInfo,
+    const Instruction *I) const {
   assert((I == nullptr || I->getOpcode() == Opcode) &&
          "Opcode should reflect passed instruction.");
-  InstructionCost Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment,
-                                                  AddressSpace, CostKind, I);
+  InstructionCost Cost = TTIImpl->getMemoryOpCost(
+      Opcode, Src, Alignment, AddressSpace, CostKind, OpdInfo, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -207,10 +207,11 @@
                               bool IsZeroCmp) const;
   bool useNeonVector(const Type *Ty) const;
 
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
 
   InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2131,7 +2131,8 @@
   ElementCount LegalVF = LT.second.getVectorElementCount();
   InstructionCost MemOpCost =
-      getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
+      getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
+                      TTI::OK_AnyValue, I);
   // Add on an overhead cost for using gathers/scatters.
   // TODO: At the moment this is applied unilaterally for all CPUs, but at some
   // point we may want a per-CPU overhead.
@@ -2147,6 +2148,7 @@
                                                 MaybeAlign Alignment,
                                                 unsigned AddressSpace,
                                                 TTI::TargetCostKind CostKind,
+                                                TTI::OperandValueKind OpdInfo,
                                                 const Instruction *I) {
   EVT VT = TLI->getValueType(DL, Ty, true);
   // Type legalization can't handle structs
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -252,10 +252,11 @@
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr);
 
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1451,6 +1451,7 @@
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
+                                           TTI::OperandValueKind OpdInfo,
                                            const Instruction *I) {
   // TODO: Handle other cost kinds.
   if (CostKind != TTI::TCK_RecipThroughput)
@@ -1490,7 +1491,7 @@
                       ? ST->getMVEVectorCostFactor(CostKind)
                       : 1;
   return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                           CostKind, I);
+                                           CostKind, OpdInfo, I);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -116,10 +116,11 @@
                                              TTI::TargetCostKind CostKind);
   InstructionCost getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
                                             const SCEV *S);
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
                                         TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -161,6 +161,7 @@
                                                  MaybeAlign Alignment,
                                                  unsigned AddressSpace,
                                                  TTI::TargetCostKind CostKind,
+                                                 TTI::OperandValueKind OpdInfo,
                                                  const Instruction *I) {
   assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
   // TODO: Handle other cost kinds.
@@ -169,7 +170,7 @@
 
   if (Opcode == Instruction::Store)
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                  CostKind, I);
+                                  CostKind, OpdInfo, I);
 
   if (Src->isVectorTy()) {
     VectorType *VecTy = cast<VectorType>(Src);
@@ -209,8 +210,8 @@
       return (3 - LogA) * Cost * NumLoads;
     }
   }
 
-  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
-                                CostKind, I);
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpdInfo, I);
 }
 
 InstructionCost
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -125,10 +125,11 @@
                                      const Instruction *I = nullptr);
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -1145,6 +1145,7 @@
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
+                                           TTI::OperandValueKind OpdInfo,
                                            const Instruction *I) {
   InstructionCost CostFactor =
       vectorCostAdjustmentFactor(Opcode, Src, nullptr);
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -105,6 +105,12 @@
                                              Optional<FastMathFlags> FMF,
                                              TTI::TargetCostKind CostKind);
 
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
+
   bool isElementTypeLegalForScalableVector(Type *Ty) const {
     return TLI->isLegalElementTypeForRVV(Ty);
   }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -236,8 +236,9 @@
   // scalable vectors, we use an upper bound on that number since we don't
   // know exactly what VL will be.
   auto &VTy = *cast<VectorType>(DataTy);
-  InstructionCost MemOpCost = getMemoryOpCost(Opcode, VTy.getElementType(),
-                                              Alignment, 0, CostKind, I);
+  InstructionCost MemOpCost =
+      getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
+                      TTI::OK_AnyValue, I);
   if (isa<ScalableVectorType>(VTy)) {
     const unsigned EltSize = DL.getTypeSizeInBits(VTy.getElementType());
     const unsigned MinSize = DL.getTypeSizeInBits(&VTy).getKnownMinValue();
@@ -376,6 +377,27 @@
   return (LT.first - 1) + BaseCost + Log2_32_Ceil(VL);
 }
 
+InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                              MaybeAlign Alignment,
+                                              unsigned AddressSpace,
+                                              TTI::TargetCostKind CostKind,
+                                              TTI::OperandValueKind OpdInfo,
+                                              const Instruction *I) {
+  InstructionCost Cost = 0;
+  if (Opcode == Instruction::Store && isa<VectorType>(Src) &&
+      (OpdInfo == TTI::OK_UniformConstantValue ||
+       OpdInfo == TTI::OK_NonUniformConstantValue)) {
+    APInt PseudoAddr = APInt::getAllOnes(DL.getPointerSizeInBits());
+    // Add a cost of address load + the cost of the vector load.
+    Cost += RISCVMatInt::getIntMatCost(PseudoAddr, DL.getPointerSizeInBits(),
+                                       getST()->getFeatureBits()) +
+            getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
+                            /*AddressSpace=*/0, CostKind);
+  }
+  return Cost + BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                       CostKind, OpdInfo, I);
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -110,10 +110,11 @@
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      unsigned Index);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
 
   InstructionCost getInterleavedMemoryOpCost(
       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1109,6 +1109,7 @@
                                                  MaybeAlign Alignment,
                                                  unsigned AddressSpace,
                                                  TTI::TargetCostKind CostKind,
+                                                 TTI::OperandValueKind OpdInfo,
                                                  const Instruction *I) {
   assert(!Src->isVoidTy() && "Invalid type");
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -155,10 +155,11 @@
                                             int VF,
                                             const APInt &DemandedDstElts,
                                             TTI::TargetCostKind CostKind);
-  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
-                                  MaybeAlign Alignment, unsigned AddressSpace,
-                                  TTI::TargetCostKind CostKind,
-                                  const Instruction *I = nullptr);
+  InstructionCost
+  getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
+                  unsigned AddressSpace, TTI::TargetCostKind CostKind,
+                  TTI::OperandValueKind OpdInfo = TTI::OK_AnyValue,
+                  const Instruction *I = nullptr);
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                         Align Alignment, unsigned AddressSpace,
                                         TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1079,7 +1079,8 @@
   }
 
   // Fallback to the default implementation.
-  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args, CxtI);
 }
 
 InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
@@ -4028,6 +4029,7 @@
                                             MaybeAlign Alignment,
                                             unsigned AddressSpace,
                                             TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueKind OpdInfo,
                                             const Instruction *I) {
   // TODO: Handle other cost kinds.
   if (CostKind != TTI::TCK_RecipThroughput) {
@@ -4056,9 +4058,16 @@
   // Handle the simple case of non-vectors.
   // NOTE: this assumes that legalization never creates vector from scalars!
-  if (!VTy || !LT.second.isVector())
+  if (!VTy || !LT.second.isVector()) {
+    InstructionCost Cost = 0;
+    if (Opcode == Instruction::Store && LT.second.isFloatingPoint() &&
+        (OpdInfo == TTI::OK_UniformConstantValue ||
+         OpdInfo == TTI::OK_NonUniformConstantValue))
+      Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
+                              /*AddressSpace=*/0, CostKind);
     // Each load/store unit costs 1.
-    return LT.first * 1;
+    return Cost + LT.first * 1;
+  }
 
   bool IsLoad = Opcode == Instruction::Load;
@@ -4068,6 +4077,13 @@
   InstructionCost Cost = 0;
 
+  // Add a cost for constant load to vector.
+  if (Opcode == Instruction::Store &&
+      (OpdInfo == TTI::OK_UniformConstantValue ||
+       OpdInfo == TTI::OK_NonUniformConstantValue))
+    Cost += getMemoryOpCost(Instruction::Load, Src, DL.getABITypeAlign(Src),
+                            /*AddressSpace=*/0, CostKind);
+
   // Source of truth: how many elements were there in the original IR vector?
   const unsigned SrcNumElt = VTy->getNumElements();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6393,12 +6393,15 @@
          "Stride should be 1 or -1 for consecutive memory access");
   const Align Alignment = getLoadStoreAlignment(I);
   InstructionCost Cost = 0;
-  if (Legal->isMaskRequired(I))
+  if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
                                       CostKind);
-  else
+  } else {
+    TTI::OperandValueProperties OpVP = TTI::OP_None;
+    TTI::OperandValueKind OpVK = TTI::getOperandInfo(I->getOperand(0), OpVP);
     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
-                                CostKind, I);
+                                CostKind, OpVK, I);
+  }
 
   bool Reverse = ConsecutiveStride < 0;
   if (Reverse)
@@ -6678,9 +6681,11 @@
     const Align Alignment = getLoadStoreAlignment(I);
     unsigned AS = getLoadStoreAddressSpace(I);
 
+    TTI::OperandValueProperties OpVP = TTI::OP_None;
+    TTI::OperandValueKind OpVK = TTI::getOperandInfo(I->getOperand(0), OpVP);
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
-                               TTI::TCK_RecipThroughput, I);
+                               TTI::TCK_RecipThroughput, OpVK, I);
   }
   return getWideningCost(I, VF);
 }
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6024,15 +6024,15 @@
             auto *LI = cast<LoadInst>(V);
             ScalarsCost += TTI->getMemoryOpCost(
                 Instruction::Load, LI->getType(), LI->getAlign(),
-                LI->getPointerAddressSpace(), CostKind, LI);
+                LI->getPointerAddressSpace(), CostKind, TTI::OK_AnyValue, LI);
           }
           auto *LI = cast<LoadInst>(E->getMainOp());
           auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
           Align Alignment = LI->getAlign();
-          GatherCost +=
-              VectorizedCnt *
-              TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                                   LI->getPointerAddressSpace(), CostKind, LI);
+          GatherCost += VectorizedCnt *
+                        TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                             LI->getPointerAddressSpace(),
+                                             CostKind, TTI::OK_AnyValue, LI);
           GatherCost += ScatterVectorizeCnt *
                         TTI->getGatherScatterOpCost(
                             Instruction::Load, LoadTy, LI->getPointerOperand(),
@@ -6392,6 +6392,12 @@
         CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
       }
       InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+      for (unsigned I = 0, Num = VL0->getNumOperands(); I < Num; ++I) {
+        if (all_of(VL, [I](Value *V) {
+              return isConstant(cast<Instruction>(V)->getOperand(I));
+            }))
+          Operands[I] = ConstantVector::getNullValue(VecTy);
+      }
       InstructionCost VecCost =
           TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
                                       Op2VK, Op1VP, Op2VP, Operands, VL0);
@@ -6425,8 +6431,9 @@
     case Instruction::Load: {
       // Cost of wide load - cost of scalar loads.
       Align Alignment = cast<LoadInst>(VL0)->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
+      InstructionCost ScalarEltCost =
+          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0,
+                               CostKind, TTI::OK_AnyValue, VL0);
       if (NeedToShuffleReuses) {
         CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
       }
@@ -6434,7 +6441,7 @@
       InstructionCost VecLdCost;
       if (E->State == TreeEntry::Vectorize) {
         VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
-                                         CostKind, VL0);
+                                         CostKind, TTI::OK_AnyValue, VL0);
       } else {
         assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
         Align CommonAlignment = Alignment;
@@ -6454,13 +6461,25 @@
       auto *SI =
           cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
       Align Alignment = SI->getAlign();
-      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
-          Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
-      InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecStCost = TTI->getMemoryOpCost(
-          Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
-      return CommonCost + VecStCost - ScalarStCost;
+      TTI::OperandValueProperties OpVP = TTI::OP_None;
+      TTI::OperandValueKind OpVK = TTI::getOperandInfo(SI->getOperand(0), OpVP);
+      InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
+          Instruction::Store, ScalarTy, Alignment, 0, CostKind, OpVK, VL0);
+      InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+      OpVK = TTI::OK_AnyValue;
+      if (all_of(E->Scalars,
+                 [](Value *V) {
+                   return isConstant(cast<Instruction>(V)->getOperand(0));
+                 }) &&
+          any_of(E->Scalars, [](Value *V) {
+            Value *Op = cast<Instruction>(V)->getOperand(0);
+            return !isa<UndefValue>(Op) && !cast<Constant>(Op)->isZeroValue();
+          }))
+        OpVK = TTI::OK_NonUniformConstantValue;
+      InstructionCost VecStCost = TTI->getMemoryOpCost(
+          Instruction::Store, VecTy, Alignment, 0, CostKind, OpVK, VL0);
+      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+      return CommonCost + VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
       CallInst *CI = cast<CallInst>(VL0);
diff --git a/llvm/test/Analysis/CostModel/X86/arith-fp.ll b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
--- a/llvm/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/X86/arith-fp.ll
@@ -629,9 +629,9 @@
 define i32 @frem(i32 %arg) {
 ; SSE1-LABEL: 'frem'
 ; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE1-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
 ; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
 ; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -640,68 +640,68 @@
 ;
 ; SSE2-LABEL: 'frem'
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <4 x double> undef, undef
+; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SSE42-LABEL: 'frem'
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <4 x double> undef, undef
+; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX-LABEL: 'frem'
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; AVX512-LABEL: 'frem'
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8F32 = frem <8 x float> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F32 = frem <16 x float> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F32 = frem <8 x float> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
-; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F64 = frem <4 x double> undef, undef
+; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; SLM-LABEL: 'frem'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = frem <8 x float> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <4 x double> undef, undef
+; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; GLM-LABEL: 'frem'
 ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4F32 = frem <4 x float> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8F32 = frem <8 x float> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16F32 = frem <16 x float> undef, undef
 ; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
-; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2F64 = frem <2 x double> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4F64 = frem <4 x double> undef, undef
+; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V8F64 = frem <8 x double> undef, undef
 ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %F32 = frem float undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/rvv-min-vector-size.ll
@@ -12,11 +12,13 @@
 define void @foo(i64* nocapture writeonly %da) {
 ; CHECK-128-LABEL: @foo(
 ; CHECK-128-NEXT:  entry:
-; CHECK-128-NEXT:    [[TMP0:%.*]] = bitcast i64* [[DA:%.*]] to <2 x i64>*
-; CHECK-128-NEXT:    store <2 x i64> <i64 0, i64 1>, <2 x i64>* [[TMP0]], align 8
+; CHECK-128-NEXT:    store i64 0, i64* [[DA:%.*]], align 8
+; CHECK-128-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 1
+; CHECK-128-NEXT:    store i64 1, i64* [[ARRAYIDX1]], align 8
 ; CHECK-128-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 2
-; CHECK-128-NEXT:    [[TMP1:%.*]] = bitcast i64* [[ARRAYIDX2]] to <2 x i64>*
-; CHECK-128-NEXT:    store <2 x i64> <i64 2, i64 3>, <2 x i64>* [[TMP1]], align 8
+; CHECK-128-NEXT:    store i64 2, i64* [[ARRAYIDX2]], align 8
+; CHECK-128-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[DA]], i64 3
+; CHECK-128-NEXT:    store i64 3, i64* [[ARRAYIDX3]], align 8
 ; CHECK-128-NEXT:    ret void
 ;
 ; CHECK-256-LABEL: @foo(
@@ -45,8 +47,9 @@
 define void @foo8(i8* nocapture writeonly %da) {
 ; CHECK-LABEL: @foo8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[DA:%.*]] to <2 x i8>*
-; CHECK-NEXT:    store <2 x i8> <i8 0, i8 1>, <2 x i8>* [[TMP0]], align 8
+; CHECK-NEXT:    store i8 0, i8* [[DA:%.*]], align 8
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 1
+; CHECK-NEXT:    store i8 1, i8* [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DA]], i8 2
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll
@@ -14,18 +14,19 @@
 ; CHECK-NEXT:    ret void
 ; CHECK:       if.else:
 ; CHECK-NEXT:    [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1
 ; CHECK-NEXT:    br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]]
 ; CHECK:       land.lhs.true.i.1:
 ; CHECK-NEXT:    br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]]
 ; CHECK:       if.then7.1:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> <i32 1, i32 5>, <2 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    store i32 1, i32* [[M_NUMCONSTRAINTROWS4]], align 4
+; CHECK-NEXT:    store i32 5, i32* [[NUB5]], align 4
 ; CHECK-NEXT:    br label [[FOR_INC_1]]
 ; CHECK:       for.inc.1:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ <i32 1, i32 5>, [[IF_THEN7_1]] ], [ <i32 0, i32 6>, [[LAND_LHS_TRUE_I_1]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], <i32 1, i32 -1>
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>*
-; CHECK-NEXT:    store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 5>, [[IF_THEN7_1]] ], [ <i32 0, i32 6>, [[LAND_LHS_TRUE_I_1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <2 x i32> [[TMP0]], <i32 1, i32 -1>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], <2 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    unreachable
 ;
 entry: