Index: lib/Analysis/CostModel.cpp =================================================================== --- lib/Analysis/CostModel.cpp +++ lib/Analysis/CostModel.cpp @@ -121,6 +121,28 @@ return isAlternate; } +// Check for a splat of a uniform value. This is not loop aware, so return +// true only for the obviously uniform cases (argument, globalvalue) +static bool isBroadcastOfUniform(Value *V) { + ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V); + if (!SVI) + return false; + + if (!isa<ConstantAggregateZero>(SVI->getMask())) + return false; + + InsertElementInst *Insert = dyn_cast<InsertElementInst>(SVI->getOperand(0)); + if (!Insert) + return false; + + ConstantInt *Index = dyn_cast<ConstantInt>(Insert->getOperand(2)); + if (!Index || !Index->isZero()) + return false; + + Value *Scalar = Insert->getOperand(1); + return (isa<Argument>(Scalar) || isa<GlobalValue>(Scalar)); +} + static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) { TargetTransformInfo::OperandValueKind OpInfo = TargetTransformInfo::OK_AnyValue; @@ -132,6 +154,11 @@ OpInfo = TargetTransformInfo::OK_UniformConstantValue; } + // Check for a splat of a uniform value. This is not loop aware, so return + // true only for the obviously uniform cases (argument, globalvalue) + if (isBroadcastOfUniform(V)) + OpInfo = TargetTransformInfo::OK_UniformValue; + return OpInfo; } Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -1752,7 +1752,14 @@ } bool LoopAccessInfo::isUniform(Value *V) const { - return (PSE->getSE()->isLoopInvariant(PSE->getSE()->getSCEV(V), TheLoop)); + auto *SE = PSE->getSE(); + // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is + // never considered uniform. + // TODO: Is this really what we want? Even without FP SCEV, we may want some + // trivially loop-invariant FP values to be considered uniform. 
+ if (!SE->isSCEVable(V->getType())) + return false; + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); } // FIXME: this function is currently a duplicate of the one in Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -240,9 +240,16 @@ static const CostTblEntry SSE2UniformConstCostTable[] = { - // We don't correctly identify costs of casts because they are marked as - // custom. // Constant splats are cheaper for the following instructions. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + static const CostTblEntry + SSE2UniformCostTable[] = { + // Uniform splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. @@ -269,21 +276,21 @@ { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. - - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; - if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && - ST->hasSSE2()) { - // pmuldq sequence. - if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) - return LT.first * 15; - - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) + if (ST->hasSSE2() && + ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || + (Op2Info == TargetTransformInfo::OK_UniformValue))) { + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue) { + // pmuldq sequence. 
+ if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + if (const auto *Entry = + CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) return LT.first * Entry->Cost; } @@ -312,12 +319,6 @@ static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - // For some cases, where the shift amount is a scalar we would be able - // to generate better code. Unfortunately, when this is the case the value - // (the splat) will get hoisted out of the loop, thereby making it invisible - // to ISel. The cost model must return worst case assumptions because it is - // used for vectorization and we don't want to make vectorized code worse - // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5971,7 +5971,7 @@ TargetTransformInfo::OP_None; Value *Op2 = I->getOperand(1); - // Check for a splat of a constant or for a non uniform vector of constants. + // Check for a splat or for a non uniform vector of constants. 
if (isa<ConstantInt>(Op2)) { ConstantInt *CInt = cast<ConstantInt>(Op2); if (CInt && CInt->getValue().isPowerOf2()) @@ -5986,6 +5986,8 @@ Op2VP = TargetTransformInfo::OP_PowerOf2; Op2VK = TargetTransformInfo::OK_UniformConstantValue; } + } else if (Legal->isUniform(Op2)) { + Op2VK = TargetTransformInfo::OK_UniformValue; + } return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK, Index: test/Analysis/CostModel/X86/uniformshift.ll =================================================================== --- test/Analysis/CostModel/X86/uniformshift.ll +++ test/Analysis/CostModel/X86/uniformshift.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mattr=+sse2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s + +define <4 x i32> @shl(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'shl' + ; SSE2: cost of 1 {{.*}} shl + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: pslld %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = shl <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @ashr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'ashr' + ; SSE2: cost of 1 {{.*}} ashr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrad %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = ashr <4 x i32> %vector , %splat + ret <4 x i32> %ret +} + +define <4 x i32> @lshr(<4 x i32> %vector, i32 %scalar) { +entry: + ; SSE2: 'lshr' + ; SSE2: cost of 1 {{.*}} lshr + ; SSE2-CODEGEN: movd %edi, %xmm1 + ; SSE2-CODEGEN: psrld %xmm1, %xmm0 + %insert = insertelement <4 x i32> undef, i32 %scalar, i32 0 + %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer + %ret = lshr <4 x i32> %vector , 
%splat + ret <4 x i32> %ret +} + Index: test/Transforms/LoopVectorize/X86/uniformshift.ll =================================================================== --- test/Transforms/LoopVectorize/X86/uniformshift.ll +++ test/Transforms/LoopVectorize/X86/uniformshift.ll @@ -0,0 +1,23 @@ +; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: "foo" +; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k +define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 { +entry: + br label %body + +body: + %i = phi i64 [ 0, %entry ], [ %next, %body ] + %ptr = getelementptr inbounds i32, i32* %p, i64 %i + %val = load i32, i32* %ptr, align 4 + %shift = ashr i32 %val, %k + store i32 %shift, i32* %ptr, align 4 + %next = add nuw nsw i64 %i, 1 + %cmp = icmp eq i64 %next, 16 + br i1 %cmp, label %exit, label %body + +exit: + ret void + +}