Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -335,6 +335,9 @@ OK_NonUniformConstantValue // Operand is a non uniform constant value. }; + /// \brief Additional properties of an operand's values. + enum OperandValueProperties { OP_None = 0, OP_PowerOf2 = 1 }; + /// \return The number of scalar or vector registers that the target has. /// If 'Vectors' is true, it returns the number of vector registers. If it is /// set to false, it returns the number of scalar registers. @@ -349,9 +352,12 @@ virtual unsigned getMaximumUnrollFactor() const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const; + virtual unsigned + getArithmeticInstrCost(unsigned Opcode, Type *Ty, + OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const; /// \return The cost of a shuffle instruction of kind Kind and of type Tp. 
/// The index and subtype parameters are used by the subvector insertion and Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -171,11 +171,12 @@ return PrevTTI->getMaximumUnrollFactor(); } -unsigned TargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, - Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { - return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); +unsigned TargetTransformInfo::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { + return PrevTTI->getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); } unsigned TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Tp, @@ -569,7 +570,8 @@ } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override { + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override { return 1; } Index: lib/CodeGen/BasicTargetTransformInfo.cpp =================================================================== --- lib/CodeGen/BasicTargetTransformInfo.cpp +++ lib/CodeGen/BasicTargetTransformInfo.cpp @@ -104,7 +104,8 @@ unsigned getMaximumUnrollFactor() const override; unsigned getRegisterBitWidth(bool Vector) const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -289,8 +290,9 @@ } unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind, - OperandValueKind) 
const { + OperandValueKind, OperandValueKind, + OperandValueProperties, + OperandValueProperties) const { // Check if any of the operands are vector operands. const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -112,10 +112,11 @@ unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const override; - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue, + OperandValueKind Opd2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; @@ -403,9 +404,10 @@ return 2; } -unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { +unsigned AArch64TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, + OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. 
std::pair LT = TLI->getTypeLegalizationCost(Ty); @@ -413,8 +415,8 @@ switch (ISD) { default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); + return TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -126,10 +126,11 @@ unsigned getAddressComputationCost(Type *Val, bool IsComplex) const override; - unsigned - getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info = OK_AnyValue, - OperandValueKind Op2Info = OK_AnyValue) const override; + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info = OK_AnyValue, + OperandValueKind Op2Info = OK_AnyValue, + OperandValueProperties Opd1PropInfo = OP_None, + OperandValueProperties Opd2PropInfo = OP_None) const override; unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const override; @@ -497,9 +498,10 @@ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned ARMTTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); std::pair LT = TLI->getTypeLegalizationCost(Ty); @@ -555,8 +557,8 @@ if (Idx != -1) return LT.first * CostTbl[Idx].Cost; - unsigned Cost = - TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); + unsigned Cost = TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, 
Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -92,9 +92,10 @@ virtual unsigned getNumberOfRegisters(bool Vector) const override; virtual unsigned getRegisterBitWidth(bool Vector) const override; virtual unsigned getMaximumUnrollFactor() const override; - virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind, - OperandValueKind) const override; + virtual unsigned + getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -318,14 +319,15 @@ return 2; } -unsigned PPCTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned PPCTTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); // Fallback to the default implementation. 
- return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, - Op2Info); + return TargetTransformInfo::getArithmeticInstrCost( + Opcode, Ty, Op1Info, Op2Info, Opd1PropInfo, Opd2PropInfo); } unsigned PPCTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -84,7 +84,8 @@ unsigned getRegisterBitWidth(bool Vector) const override; unsigned getMaximumUnrollFactor() const override; unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind, - OperandValueKind) const override; + OperandValueKind, OperandValueProperties, + OperandValueProperties) const override; unsigned getShuffleCost(ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) const override; unsigned getCastInstrCost(unsigned Opcode, Type *Dst, @@ -178,15 +179,37 @@ return 2; } -unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Op1Info, - OperandValueKind Op2Info) const { +unsigned X86TTI::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, OperandValueKind Op1Info, + OperandValueKind Op2Info, OperandValueProperties Opd1PropInfo, + OperandValueProperties Opd2PropInfo) const { // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + if (ISD == ISD::SDIV && + Op2Info == TargetTransformInfo::OK_UniformConstantValue && + Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) { + // On X86, vector signed divisions by power-of-two constants are + // normally expanded to the sequence SRA + SRL + ADD + SRA. + // The OperandValue properties may not be the same as those of the previous + // operation; conservatively assume OP_None. 
+ unsigned Cost = + 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); + + return Cost; + } + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5837,18 +5837,31 @@ TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); // Check for a splat of a constant or for a non uniform vector of constants. 
- if (isa(Op2)) + if (isa(Op2)) { + ConstantInt *CInt = cast(Op2); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; - else if (isa(Op2) || isa(Op2)) { + } else if (isa(Op2) || isa(Op2)) { Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - if (cast(Op2)->getSplatValue() != nullptr) + Constant *SplatValue = cast(Op2)->getSplatValue(); + if (SplatValue) { + ConstantInt *CInt = dyn_cast(SplatValue); + if (CInt && CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; + } } - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, + Op1VP, Op2VP); } case Instruction::Select: { SelectInst *SI = cast(I); Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1425,6 +1425,10 @@ TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OperandValueProperties Op1VP = + TargetTransformInfo::OP_None; + TargetTransformInfo::OperandValueProperties Op2VP = + TargetTransformInfo::OP_None; // If all operands are exactly the same ConstantInt then set the // operand kind to OK_UniformConstantValue. 
@@ -1446,11 +1450,15 @@ CInt != cast(I->getOperand(1))) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && + CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; - ScalarCost = - VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK); - VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK); + ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, + Op1VP, Op2VP); + VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, + Op1VP, Op2VP); } return VecCost - ScalarCost; } Index: test/Transforms/LoopVectorize/X86/powof2div.ll =================================================================== --- test/Transforms/LoopVectorize/X86/powof2div.ll +++ test/Transforms/LoopVectorize/X86/powof2div.ll @@ -0,0 +1,31 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon = type { [100 x i32], i32, [100 x i32] } + +@Foo = common global %struct.anon zeroinitializer, align 4 + +;CHECK: load <4 x i32>* +;CHECK: sdiv <4 x i32> +;CHECK: store <4 x i32> + +define void @foo(){ +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 2, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %div = sdiv i32 %0, 2 + %arrayidx2 = getelementptr inbounds %struct.anon* @Foo, i64 0, i32 0, i64 %indvars.iv + store i32 %div, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + Index: test/Transforms/SLPVectorizer/X86/powof2div.ll 
=================================================================== --- test/Transforms/SLPVectorizer/X86/powof2div.ll +++ test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;CHECK: load <4 x i32>* +;CHECK: add <4 x i32> +;CHECK: sdiv <4 x i32> +define void @f(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ +entry: + %0 = load i32* %b, align 4 + %1 = load i32* %c, align 4 + %add = add nsw i32 %1, %0 + %div = sdiv i32 %add, 2 + store i32 %div, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32* %b, i64 1 + %2 = load i32* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds i32* %c, i64 1 + %3 = load i32* %arrayidx4, align 4 + %add5 = add nsw i32 %3, %2 + %div6 = sdiv i32 %add5, 2 + %arrayidx7 = getelementptr inbounds i32* %a, i64 1 + store i32 %div6, i32* %arrayidx7, align 4 + %arrayidx8 = getelementptr inbounds i32* %b, i64 2 + %4 = load i32* %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds i32* %c, i64 2 + %5 = load i32* %arrayidx9, align 4 + %add10 = add nsw i32 %5, %4 + %div11 = sdiv i32 %add10, 2 + %arrayidx12 = getelementptr inbounds i32* %a, i64 2 + store i32 %div11, i32* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i32* %b, i64 3 + %6 = load i32* %arrayidx13, align 4 + %arrayidx14 = getelementptr inbounds i32* %c, i64 3 + %7 = load i32* %arrayidx14, align 4 + %add15 = add nsw i32 %7, %6 + %div16 = sdiv i32 %add15, 2 + %arrayidx17 = getelementptr inbounds i32* %a, i64 3 + store i32 %div16, i32* %arrayidx17, align 4 + ret void +} +