Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -329,10 +329,11 @@ /// \brief Additional information about an operand's possible values. enum OperandValueKind { - OK_AnyValue, // Operand can have any value. - OK_UniformValue, // Operand is uniform (splat of a value). - OK_UniformConstantValue, // Operand is uniform constant. - OK_NonUniformConstantValue // Operand is a non uniform constant value. + OK_AnyValue, // Operand can have any value. + OK_UniformValue, // Operand is uniform (splat of a value). + OK_UniformConstantValue, // Operand is uniform constant. + OK_UniformConstantPowerOfTwo, // Operand is uniform constant power of 2. + OK_NonUniformConstantValue // Operand is a non uniform constant value. }; /// \return The number of scalar or vector registers that the target has. Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -202,6 +202,21 @@ return LT.first * AVX2UniformConstCostTable[Idx].Cost; } + static const CostTblEntry + AVX2UniformConstPowOf2CostTable[] = { + {ISD::SDIV, MVT::v16i16, 1}, // psraw instruction + {ISD::UDIV, MVT::v16i16, 1}, // psrlw instruction + {ISD::SDIV, MVT::v8i32, 1}, // psrad instruction + {ISD::UDIV, MVT::v8i32, 1}, // psrld instruction + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantPowerOfTwo && + ST->hasAVX2()) { + int Idx = CostTableLookup(AVX2UniformConstPowOf2CostTable, ISD, LT.second); + if (Idx != -1) + return LT.first * AVX2UniformConstPowOf2CostTable[Idx].Cost; + } + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. 
@@ -241,6 +256,7 @@ if (ST->hasAVX2()) { if (ISD == ISD::SHL && LT.second == MVT::v16i16 && (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_UniformConstantPowerOfTwo || Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) // On AVX2, a packed v16i16 shift left by a constant build_vector // is lowered into a vector multiply (vpmullw). @@ -286,6 +302,22 @@ if (Idx != -1) return LT.first * SSE2UniformConstCostTable[Idx].Cost; } + static const CostTblEntry + SSE2UniformConstPowerOf2CostTable[] = { + // We currently only support DIV ops. + {ISD::SDIV, MVT::v8i16, 1}, // psraw sequence + {ISD::UDIV, MVT::v8i16, 1}, // psrlw sequence + {ISD::SDIV, MVT::v4i32, 1}, // psrad sequence + {ISD::UDIV, MVT::v4i32, 1}, // psrld sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantPowerOfTwo && + ST->hasSSE2()) { + int Idx = + CostTableLookup(SSE2UniformConstPowerOf2CostTable, ISD, LT.second); + if (Idx != -1) + return LT.first * SSE2UniformConstPowerOf2CostTable[Idx].Cost; + } if (ISD == ISD::SHL && Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1438,6 +1438,9 @@ CInt != cast<ConstantInt>(I->getOperand(1))) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && + CInt->getValue().isPowerOf2()) + Op2VK = TargetTransformInfo::OK_UniformConstantPowerOfTwo; ScalarCost = VecTy->getNumElements() * Index: test/Transforms/SLPVectorizer/X86/powof2div.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/powof2div.ll +++ test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu 
-mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;CHECK: load <4 x i32>* +;CHECK: add <4 x i32> +;CHECK: sdiv <4 x i32> +define void @f(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c){ +entry: + %0 = load i32* %b, align 4 + %1 = load i32* %c, align 4 + %add = add nsw i32 %1, %0 + %div = sdiv i32 %add, 2 + store i32 %div, i32* %a, align 4 + %arrayidx3 = getelementptr inbounds i32* %b, i64 1 + %2 = load i32* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds i32* %c, i64 1 + %3 = load i32* %arrayidx4, align 4 + %add5 = add nsw i32 %3, %2 + %div6 = sdiv i32 %add5, 2 + %arrayidx7 = getelementptr inbounds i32* %a, i64 1 + store i32 %div6, i32* %arrayidx7, align 4 + %arrayidx8 = getelementptr inbounds i32* %b, i64 2 + %4 = load i32* %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds i32* %c, i64 2 + %5 = load i32* %arrayidx9, align 4 + %add10 = add nsw i32 %5, %4 + %div11 = sdiv i32 %add10, 2 + %arrayidx12 = getelementptr inbounds i32* %a, i64 2 + store i32 %div11, i32* %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i32* %b, i64 3 + %6 = load i32* %arrayidx13, align 4 + %arrayidx14 = getelementptr inbounds i32* %c, i64 3 + %7 = load i32* %arrayidx14, align 4 + %add15 = add nsw i32 %7, %6 + %div16 = sdiv i32 %add15, 2 + %arrayidx17 = getelementptr inbounds i32* %a, i64 3 + store i32 %div16, i32* %arrayidx17, align 4 + ret void +} +