diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6881,6 +6881,42 @@ (void)E; return VecCost - ScalarCost; }; + // Calculate cost difference from vectorizing set of GEPs. + // Negative value means vectorizing is profitable. + auto GetGEPCostDiff = [=](ArrayRef Ptrs, Value *BasePtr) { + InstructionCost CostSavings = 0; + for (Value *V : Ptrs) { + if (V == BasePtr) + continue; + auto *Ptr = dyn_cast(V); + // GEPs may contain just addresses without instructions, considered free. + // GEPs with all constant indices also considered to have zero cost. + if (!Ptr || Ptr->hasAllConstantIndices()) + continue; + + // Here we differentiate two cases: when GEPs represent a regular + // vectorization tree node (and hence vectorized) and when the set is + // arguments of a set of loads or stores being vectorized. In the former + // case all the scalar GEPs will be removed as a result of vectorization. + // For any external uses of some lanes extract element instructions will + // be generated (which cost is estimated separately). For the latter case + // since the set of GEPs itself is not vectorized those used more than + // once will remain staying in vectorized code as well. So we should not + // count them as savings. + if (!Ptr->hasOneUse() && isa(VL0)) + continue; + + // TODO: it is target dependent, so need to implement and then use a TTI + // interface. + CostSavings += TTI->getArithmeticInstrCost(Instruction::Add, + Ptr->getType(), CostKind); + } + LLVM_DEBUG(dbgs() << "SLP: Calculated GEPs cost savings or Tree:\n"; + E->dump()); + LLVM_DEBUG(dbgs() << "SLP: GEP cost saving = " << CostSavings << "\n"); + return InstructionCost() - CostSavings; + }; + switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. @@ -7155,52 +7191,39 @@ case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - case Instruction::GetElementPtr: { - unsigned Opcode = ShuffleOrOp == Instruction::GetElementPtr - ? static_cast(Instruction::Add) - : ShuffleOrOp; + case Instruction::Xor: { auto GetScalarCost = [=](unsigned Idx) { - auto *VI = dyn_cast(VL[Idx]); - // GEPs may contain just addresses without instructions, consider - // their cost 0. - if (!VI) - return InstructionCost(); + auto *VI = cast(VL[Idx]); unsigned OpIdx = isa(VI) ? 0 : 1; TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0)); TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(VI->getOperand(OpIdx)); SmallVector Operands(VI->operand_values()); - return TTI->getArithmeticInstrCost(Opcode, ScalarTy, CostKind, Op1Info, - Op2Info, Operands, VI); + return TTI->getArithmeticInstrCost(ShuffleOrOp, ScalarTy, CostKind, + Op1Info, Op2Info, Operands, VI); }; auto GetVectorCost = [=](InstructionCost CommonCost) { unsigned OpIdx = isa(VL0) ? 0 : 1; TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0); TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx); - return TTI->getArithmeticInstrCost(Opcode, VecTy, CostKind, Op1Info, + return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info, Op2Info) + CommonCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case Instruction::GetElementPtr: { + return CommonCost + GetGEPCostDiff(VL, VL0); + } case Instruction::Load: { auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast(VL[Idx]); - InstructionCost GEPCost = 0; - if (VI != VL0) { - auto *Ptr = dyn_cast(VI->getPointerOperand()); - if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices()) - GEPCost = TTI->getArithmeticInstrCost(Instruction::Add, - Ptr->getType(), CostKind); - } - return GEPCost + - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), + return TTI->getMemoryOpCost(Instruction::Load, ScalarTy, VI->getAlign(), VI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), VI); }; + auto *LI0 = cast(VL0); auto GetVectorCost = [=](InstructionCost CommonCost) { - auto *LI0 = cast(VL0); InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { VecLdCost = TTI->getMemoryOpCost( @@ -7218,34 +7241,46 @@ } return VecLdCost + CommonCost; }; - return GetCostDiff(GetScalarCost, GetVectorCost); + + InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); + // If this node generates masked gather load then it is not a terminal node. + // Hence address operand cost is estimated separately. + if (E->State == TreeEntry::ScatterVectorize) + return Cost; + + // Estimate cost of GEPs since this tree node is a terminator. + SmallVector PointerOps(VL.size()); + for (auto [I, V] : enumerate(VL)) + PointerOps[I] = cast(V)->getPointerOperand(); + return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand()); } case Instruction::Store: { bool IsReorder = !E->ReorderIndices.empty(); - auto *SI = cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); auto GetScalarCost = [=](unsigned Idx) { auto *VI = cast(VL[Idx]); - InstructionCost GEPCost = 0; - if (VI != SI) { - auto *Ptr = dyn_cast(VI->getPointerOperand()); - if (Ptr && Ptr->hasOneUse() && !Ptr->hasAllConstantIndices()) - GEPCost = TTI->getArithmeticInstrCost(Instruction::Add, - Ptr->getType(), CostKind); - } TTI::OperandValueInfo OpInfo = getOperandInfo(VI, 0); - return GEPCost + TTI->getMemoryOpCost( - Instruction::Store, ScalarTy, VI->getAlign(), - VI->getPointerAddressSpace(), CostKind, OpInfo, VI); + return TTI->getMemoryOpCost(Instruction::Store, ScalarTy, VI->getAlign(), + VI->getPointerAddressSpace(), CostKind, + OpInfo, VI); }; + auto *BaseSI = + cast(IsReorder ? VL[E->ReorderIndices.front()] : VL0); auto GetVectorCost = [=](InstructionCost CommonCost) { // We know that we can merge the stores. Calculate the cost. TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0); - return TTI->getMemoryOpCost(Instruction::Store, VecTy, SI->getAlign(), - SI->getPointerAddressSpace(), CostKind, + return TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(), + BaseSI->getPointerAddressSpace(), CostKind, OpInfo) + CommonCost; }; - return GetCostDiff(GetScalarCost, GetVectorCost); + SmallVector PointerOps(VL.size()); + for (auto [I, V] : enumerate(VL)) { + unsigned Idx = IsReorder ? E->ReorderIndices[I] : I; + PointerOps[Idx] = cast(V)->getPointerOperand(); + } + + return GetCostDiff(GetScalarCost, GetVectorCost) + + GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); } case Instruction::Call: { auto GetScalarCost = [=](unsigned Idx) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S -o - -mtriple=x86_64-unknown-linux -mcpu=haswell -slp-threshold=-3 < %s | FileCheck %s @e = dso_local local_unnamed_addr global i32 0, align 4 @f = dso_local local_unnamed_addr global i32 0, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -22,7 +22,7 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' - ; YAML-NEXT: - Cost: '-17' + ; YAML-NEXT: - Cost: '-3' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '7' entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-12 | FileCheck %s +; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-13 | FileCheck %s define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test(