Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -488,6 +488,10 @@
   /// any callee-saved registers, so would require a spill and fill.
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) const;

+  /// \returns True if the target supports using a memory operand as the
+  /// destination for the given opcode and type.
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) const;
+
   /// \returns True if the intrinsic is a supported memory intrinsic. Info
   /// will contain additional information - whether the intrinsic may write
   /// or read to memory, volatility and the pointer. Info is undefined
@@ -591,6 +595,7 @@
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
   virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) = 0;
   virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) = 0;
+  virtual bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) = 0;
   virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                                   MemIntrinsicInfo &Info) = 0;
   virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
@@ -761,6 +766,9 @@
   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) override {
     return Impl.getCostOfKeepingLiveOverCall(Tys);
   }
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty) override {
+    return Impl.isLegalMemDestOperand(Opcode, Ty);
+  }
   bool getTgtMemIntrinsic(IntrinsicInst *Inst,
                           MemIntrinsicInfo &Info) override {
     return Impl.getTgtMemIntrinsic(Inst, Info);
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -317,6 +317,8 @@

   unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { return 0; }

+  bool isLegalMemDestOperand(unsigned, Type *) { return false; }
+
   bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) {
     return false;
   }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -265,6 +265,11 @@
   return TTIImpl->getCostOfKeepingLiveOverCall(Tys);
 }

+bool TargetTransformInfo::isLegalMemDestOperand(unsigned Opcode,
+                                                Type *Ty) const {
+  return TTIImpl->isLegalMemDestOperand(Opcode, Ty);
+}
+
 bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
                                              MemIntrinsicInfo &Info) const {
   return TTIImpl->getTgtMemIntrinsic(Inst, Info);
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -101,9 +101,9 @@
                          Type *Ty);
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty);
+  bool isLegalMemDestOperand(unsigned Opcode, Type *Ty);
   bool isLegalMaskedLoad(Type *DataType, int Consecutive);
   bool isLegalMaskedStore(Type *DataType, int Consecutive);
-
   /// @}
 };
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -66,6 +66,26 @@
 }

+bool X86TTIImpl::isLegalMemDestOperand(unsigned Opcode, Type *Ty) {
+  if (Ty->isVectorTy())
+    return false;
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return true;
+  }
+  return false;
+}
+
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
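(Reviewer sketch, not part of the patch: the contract the X86 override above establishes, written as a standalone helper. It assumes a TargetTransformInfo already constructed for an x86-64 target, obtained however the hosting pass normally gets one; the helper name checkMemDestContract is made up for illustration.)

// Hypothetical check of the new hook's behavior on x86-64: scalar integer
// ALU/shift opcodes can fold a memory destination, vector types never can.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

static void checkMemDestContract(const TargetTransformInfo &TTI,
                                 LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  VectorType *V2I64 = VectorType::get(I64, 2);
  // A scalar shift can fold its memory destination (e.g. shrq $5, (%rdx)).
  assert(TTI.isLegalMemDestOperand(Instruction::LShr, I64));
  // The vector form cannot, so the hook must reject any vector type.
  assert(!TTI.isLegalMemDestOperand(Instruction::LShr, V2I64));
  // FP opcodes are not in the X86 switch, so they are rejected too.
  assert(!TTI.isLegalMemDestOperand(Instruction::FAdd, I64));
}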
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -359,6 +359,10 @@
   /// holding live values over call sites.
   int getSpillCost();

+  /// \returns the cost incurred by other side effects, such as failing
+  /// to combine instructions after vectorization.
+  int getOtherCost();
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
   int getTreeCost();
@@ -1710,6 +1714,41 @@
   return Cost;
 }

+int BoUpSLP::getOtherCost() {
+  // Many X86 scalar instructions support using a memory operand as the
+  // destination, but most vector instructions do not. For example:
+  //   shrq $5, (%rdx)
+  //   shrq $5, 8(%rdx)
+  // is often better than:
+  //   movdqu (%rdx), %xmm0
+  //   psrlq $5, %xmm0
+  //   movdqu %xmm0, (%rdx)
+  if (VectorizableTree.size() >= 3) {
+    StoreInst *SI = dyn_cast<StoreInst>(VectorizableTree[0].Scalars[0]);
+    if (!SI)
+      return 0;
+
+    LoadInst *LI = dyn_cast<LoadInst>(VectorizableTree[2].Scalars[0]);
+    if (!LI)
+      return 0;
+
+    Instruction *IT = cast<Instruction>(VectorizableTree[1].Scalars[0]);
+    ArrayRef<Value *> VL = VectorizableTree[0].Scalars;
+    VectorType *VecTy = VectorType::get(IT->getType(), VL.size());
+
+    // If the scalar version of IT cannot use a memory operand as its
+    // destination, or the vector version can, there is no extra cost.
+    // Likewise if LI and SI use different memory addresses.
+    if (!TTI->isLegalMemDestOperand(IT->getOpcode(), IT->getType()) ||
+        TTI->isLegalMemDestOperand(IT->getOpcode(), VecTy) ||
+        SI->getOperand(1) != LI->getOperand(0))
+      return 0;
+
+    return VL.size() * 2;
+  }
+  return 0;
+}
+
 int BoUpSLP::getTreeCost() {
   int Cost = 0;
   DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
@@ -1753,6 +1792,8 @@

   Cost += getSpillCost();

+  Cost += getOtherCost();
+
   DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost << ".\n");
   return Cost + ExtractCost;
 }
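(Reviewer note: a hypothetical C++ reduction of the pattern getOtherCost() penalizes; it is illustrative only, not taken from the PR23510 report.)

// In-place 64-bit right shifts. x86 can fold each statement into a
// single shrq with a memory destination, roughly:
//   shrq $5, 8(%rdi)
//   shrq $5, 16(%rdi)
// whereas a 2-wide SLP vectorization of the pair needs the
// movdqu/psrlq/movdqu sequence shown in the comment above.
void shiftInPlace(unsigned long long *p) {
  p[1] >>= 5;
  p[2] >>= 5;
}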
Index: test/Transforms/SLPVectorizer/X86/pr23510.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/pr23510.ll
+++ test/Transforms/SLPVectorizer/X86/pr23510.ll
@@ -0,0 +1,36 @@
+; PR23510
+; RUN: opt < %s -mtriple=x86_64-linux-gnu -basicaa -slp-vectorizer -S | FileCheck %s
+; Check that SLP does not generate a vectorized lshr.
+; CHECK-LABEL: @foo(
+; CHECK-NOT: lshr <2 x i64>
+
+define void @foo(float* nocapture readonly %p1, i32 %p2, i64* nocapture %p3, float* nocapture %p4) {
+entry:
+  %idx.ext = sext i32 %p2 to i64
+  %add.ptr = getelementptr inbounds float, float* %p1, i64 %idx.ext
+  %arrayidx1 = getelementptr inbounds float, float* %add.ptr, i64 5
+  %tmp = load float, float* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %p4, i64 3
+  %tmp1 = load float, float* %arrayidx2, align 4
+  %add = fadd float %tmp, %tmp1
+  store float %add, float* %arrayidx2, align 4
+  store i64 0, i64* %p3, align 8
+  %arrayidx4 = getelementptr inbounds i64, i64* %p3, i64 1
+  %tmp2 = load i64, i64* %arrayidx4, align 8
+  %shr5 = lshr i64 %tmp2, 5
+  store i64 %shr5, i64* %arrayidx4, align 8
+  %arrayidx6 = getelementptr inbounds i64, i64* %p3, i64 2
+  %tmp3 = load i64, i64* %arrayidx6, align 8
+  %shr7 = lshr i64 %tmp3, 5
+  store i64 %shr7, i64* %arrayidx6, align 8
+  %arrayidx8 = getelementptr inbounds i64, i64* %p3, i64 3
+  %tmp4 = load i64, i64* %arrayidx8, align 8
+  %shr9 = lshr i64 %tmp4, 5
+  store i64 %shr9, i64* %arrayidx8, align 8
+  %add.ptr11 = getelementptr inbounds float, float* %add.ptr, i64 %idx.ext
+  %tmp5 = load float, float* %add.ptr11, align 4
+  %tmp6 = load float, float* %p4, align 4
+  %add15 = fadd float %tmp5, %tmp6
+  store float %add15, float* %p4, align 4
+  ret void
+}
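(Reviewer note: my trace of how the test above exercises the new cost hook, assuming SLP bundles the stores at VectorizableTree[0], the lshr instructions at node 1, and the loads at node 2, as getOtherCost() expects; the numbers are a back-of-the-envelope reading of the patch, not tool output.)

// Hypothetical trace of getOtherCost() on the test's 2-wide lshr bundle:
//   isLegalMemDestOperand(LShr, i64)       -> true  (shrq folds the store)
//   isLegalMemDestOperand(LShr, <2 x i64>) -> false (psrlq cannot)
//   SI->getOperand(1) == LI->getOperand(0) (load and store share %arrayidx4)
constexpr int VF = 2;            // lanes in <2 x i64>
constexpr int Penalty = VF * 2;  // value getOtherCost() returns: 4
static_assert(Penalty == 4, "penalty offsets the vector shift's saving");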