Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -121,6 +121,28 @@ return isAlternate; } +// Check for a splat of a uniform value. This is not loop aware, so return +// true only for the obviously uniform cases (argument, globalvalue) +static bool isBroadcastOfUniform(Value *V) { + ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V); + if (!SVI) + return false; + + if (!isa<ConstantAggregateZero>(SVI->getMask())) + return false; + + InsertElementInst *Insert = dyn_cast<InsertElementInst>(SVI->getOperand(0)); + if (!Insert) + return false; + + ConstantInt *Index = dyn_cast<ConstantInt>(Insert->getOperand(2)); + if (!Index || !Index->isZero()) + return false; + + Value *Scalar = Insert->getOperand(1); + return (isa<Argument>(Scalar) || isa<GlobalValue>(Scalar)); +} + static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) { TargetTransformInfo::OperandValueKind OpInfo = TargetTransformInfo::OK_AnyValue; @@ -132,6 +154,11 @@ OpInfo = TargetTransformInfo::OK_UniformConstantValue; } + // Check for a splat of a uniform value. This is not loop aware, so return + // true only for the obviously uniform cases (argument, globalvalue) + if (isBroadcastOfUniform(V)) + OpInfo = TargetTransformInfo::OK_UniformValue; + return OpInfo; } Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1752,7 +1752,14 @@ } bool LoopAccessInfo::isUniform(Value *V) const { - return (PSE->getSE()->isLoopInvariant(PSE->getSE()->getSCEV(V), TheLoop)); + auto *SE = PSE->getSE(); + // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is + // never considered uniform. + // TODO: Is this really what we want? Even without FP SCEV, we may want some + // trivially loop-invariant FP values to be considered uniform. 
+ if (!SE->isSCEVable(V->getType())) + return false; + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); } // FIXME: this function is currently a duplicate of the one in Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -240,9 +240,16 @@ static const CostTblEntry SSE2UniformConstCostTable[] = { - // We don't correctly identify costs of casts because they are marked as - // custom. // Constant splats are cheaper for the following instructions. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + static const CostTblEntry + SSE2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. @@ -269,21 +276,21 @@ { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. - - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; - if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && - ST->hasSSE2()) { - // pmuldq sequence. - if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) - return LT.first * 15; - - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue) { + // pmuldq sequence. 
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + if (const auto *Entry = + CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } @@ -312,12 +319,6 @@ static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - // For some cases, where the shift amount is a scalar we would be able - // to generate better code. Unfortunately, when this is the case the value - // (the splat) will get hoisted out of the loop, thereby making it invisible - // to ISel. The cost model must return worst case assumptions because it is - // used for vectorization and we don't want to make vectorized code worse - // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5971,7 +5971,7 @@ TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); - // Check for a splat of a constant or for a non uniform vector of constants. + // Check for a splat or for a non uniform vector of constants. 
if (isa<ConstantInt>(Op2)) { ConstantInt *CInt = cast<ConstantInt>(Op2); if (CInt && CInt->getValue().isPowerOf2()) @@ -5986,6 +5986,8 @@ Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } + } else if (Legal->isUniform(Op2)) { + Op2VK = TargetTransformInfo::OK_UniformValue; + } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, Index: test/Analysis/CostModel/X86/uniformshift.ll =================================================================== --- test/Analysis/CostModel/X86/uniformshift.ll +++ test/Analysis/CostModel/X86/uniformshift.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=+sse2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s + +define <4 x i32> @shl(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'shl' + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: pslld %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = shl <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @ashr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'ashr' + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrad %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = ashr <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @lshr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'lshr' + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrld %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = lshr <4 x i32> %vector , 
%splat + ret <4 x i32> %ret +} + Index: test/Transforms/LoopVectorize/X86/uniformshift.ll =================================================================== --- test/Transforms/LoopVectorize/X86/uniformshift.ll +++ test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -0,0 +1,23 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: "foo" +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k +define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 { +entry: + br label %body + +body: + %i = phi i64 [ 0, %entry ], [ %next, %body ] + %ptr = getelementptr inbounds i32, i32* %p, i64 %i + %val = load i32, i32* %ptr, align 4 + %shift = ashr i32 %val, %k + store i32 %shift, i32* %ptr, align 4 + %next = add nuw nsw i64 %i, 1 + %cmp = icmp eq i64 %next, 16 + br i1 %cmp, label %exit, label %body + +exit: + ret void + +}