Index: lib/Analysis/CostModel.cpp
===================================================================
--- lib/Analysis/CostModel.cpp
+++ lib/Analysis/CostModel.cpp
@@ -130,6 +130,20 @@
     OpInfo = TargetTransformInfo::OK_NonUniformConstantValue;
     if (cast<Constant>(V)->getSplatValue() != nullptr)
       OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+  } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(V)) {
+    // Check for a splat of a variable.
+    unsigned NumVecElems = V->getType()->getVectorNumElements();
+    if (isPowerOf2_32(NumVecElems)) {
+      SmallVector<int, 16> ShuffleMask(NumVecElems, 0);
+      // Check whether the shuffle mask broadcasts a single source element.
+      SmallVector<int, 16> Mask = SI->getShuffleMask();
+      for (unsigned i = 0; i < NumVecElems; i++) {
+        for (unsigned j = 0; j < NumVecElems; j++)
+          ShuffleMask[j] = i;
+        if (ShuffleMask == Mask)
+          OpInfo = TargetTransformInfo::OK_UniformValue;
+      }
+    }
   }
 
   return OpInfo;
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -223,6 +223,25 @@
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
+  static const CostTblEntry<MVT::SimpleValueType> SSE2UniformCostTable[] = {
+    { ISD::SHL, MVT::v8i16, 1 }, // psllw
+    { ISD::SHL, MVT::v4i32, 1 }, // pslld
+    { ISD::SHL, MVT::v2i64, 1 }, // psllq
+
+    { ISD::SRL, MVT::v8i16, 1 }, // psrlw
+    { ISD::SRL, MVT::v4i32, 1 }, // psrld
+    { ISD::SRL, MVT::v2i64, 1 }, // psrlq
+
+    { ISD::SRA, MVT::v8i16, 1 }, // psraw
+    { ISD::SRA, MVT::v4i32, 1 }, // psrad
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformValue && ST->hasSSE2()) {
+    int Idx = CostTableLookup(SSE2UniformCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * SSE2UniformCostTable[Idx].Cost;
+  }
+
   if (ISD == ISD::SHL &&
       Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
     EVT VT = LT.second;
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4535,6 +4535,10 @@
         Op2VP = TargetTransformInfo::OP_PowerOf2;
         Op2VK = TargetTransformInfo::OK_UniformConstantValue;
       }
+    } else if (SE->isSCEVable(Op2->getType())) {
+      const SCEV *Op2SCEV = SE->getSCEV(Op2);
+      if (SE->isLoopInvariant(Op2SCEV, TheLoop))
+        Op2VK = TargetTransformInfo::OK_UniformValue;
     }
 
     return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
Index: test/Analysis/CostModel/X86/testshiftashr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftashr.ll
+++ test/Analysis/CostModel/X86/testshiftashr.ll
@@ -529,3 +529,59 @@
   ret %shifttypec32i8 %0
 }
 
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: sarb %cl
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = ashr <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psraw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = ashr <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: psrad
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = ashr <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: sarq %cl
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = ashr <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Analysis/CostModel/X86/testshiftlshr.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftlshr.ll
+++ test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -527,3 +527,60 @@
     i8 3, i8 3, i8 3, i8 3>
   ret %shifttypec32i8 %0
 }
+
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 160 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: shrb %cl
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = lshr <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psrlw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = lshr <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: psrld
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = lshr <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: psrlq
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = lshr <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Analysis/CostModel/X86/testshiftshl.ll
===================================================================
--- test/Analysis/CostModel/X86/testshiftshl.ll
+++ test/Analysis/CostModel/X86/testshiftshl.ll
@@ -527,3 +527,60 @@
     i8 3, i8 3, i8 3, i8 3>
   ret %shifttypec32i8 %0
 }
+
+; Uniform variable shift.
+%shifttypeu16i8 = type <16 x i8>
+define %shifttypeu16i8 @shift16i8u(%shifttypeu16i8 %a, i8 %b) {
+entry:
+  ; SSE2: shift16i8u
+  ; SSE2: cost of 30 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i8u
+  ; SSE2-CODEGEN: psllw
+
+  %broadcast.splatinsert1 = insertelement <16 x i8> undef, i8 %b, i32 0
+  %broadcast.splat2 = shufflevector <16 x i8> %broadcast.splatinsert1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp = shl <16 x i8> %a, %broadcast.splat2
+  ret %shifttypeu16i8 %tmp
+}
+
+%shifttypeu8i16 = type <8 x i16>
+define %shifttypeu8i16 @shift8i16u(%shifttypeu8i16 %a, i16 %b) {
+entry:
+  ; SSE2: shift8i16u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i16u
+  ; SSE2-CODEGEN: psllw
+
+  %broadcast.splatinsert1 = insertelement <8 x i16> undef, i16 %b, i32 0
+  %broadcast.splat2 = shufflevector <8 x i16> %broadcast.splatinsert1, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp = shl <8 x i16> %a, %broadcast.splat2
+  ret %shifttypeu8i16 %tmp
+}
+
+%shifttypeu4i32 = type <4 x i32>
+define %shifttypeu4i32 @shift4i32u(%shifttypeu4i32 %a, i32 %b) {
+entry:
+  ; SSE2: shift4i32u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i32u
+  ; SSE2-CODEGEN: pslld
+
+  %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %b, i32 0
+  %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp = shl <4 x i32> %a, %broadcast.splat2
+  ret %shifttypeu4i32 %tmp
+}
+
+%shifttypeu2i64 = type <2 x i64>
+define %shifttypeu2i64 @shift2i64u(%shifttypeu2i64 %a, i64 %b) {
+entry:
+  ; SSE2: shift2i64u
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i64u
+  ; SSE2-CODEGEN: psllq
+
+  %broadcast.splatinsert1 = insertelement <2 x i64> undef, i64 %b, i32 0
+  %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> undef, <2 x i32> zeroinitializer
+  %tmp = shl <2 x i64> %a, %broadcast.splat2
+  ret %shifttypeu2i64 %tmp
+}
Index: test/Transforms/LoopVectorize/uniform-shift.ll
===================================================================
--- test/Transforms/LoopVectorize/uniform-shift.ll
+++ test/Transforms/LoopVectorize/uniform-shift.ll
@@ -0,0 +1,39 @@
+; PR23582
+; RUN: opt < %s -basicaa -loop-vectorize -force-vector-interleave=1 -dce -instcombine -simplifycfg -S | llc | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@k = common global i32 0, align 4
+@A1 = common global [1024 x i32] zeroinitializer, align 16
+@B1 = common global [1024 x i32] zeroinitializer, align 16
+@C1 = common global [1024 x i32] zeroinitializer, align 16
+
+; Check that the loop vectorizer emits a uniform (splatted) vector shift.
+; CHECK-LABEL: kernel1:
+; CHECK: [[LOOP:^[a-zA-Z0-9_.]+]]:
+; CHECK: movdqa {{.*}}, [[REG:%xmm[0-7]]]
+; CHECK-NEXT: psrad {{%xmm[0-7]}}, [[REG]]
+; CHECK-NEXT: movdqa [[REG]], {{.*}}
+; CHECK-NEXT: addq $16, {{%[a-z0-9]+}}
+; CHECK-NEXT: jne [[LOOP]]
+
+define void @kernel1() {
+entry:
+  %tmp = load i32, i32* @k, align 4
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @B1, i64 0, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx, align 4
+  %shr = ashr i32 %tmp1, %tmp
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @A1, i64 0, i64 %indvars.iv
+  store i32 %shr, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
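
Note for reviewers (not part of the patch): the OK_UniformValue plumbing above hinges on one predicate, namely that the second shift operand is a shufflevector whose mask broadcasts a single source lane. Below is a minimal standalone C++ sketch of that predicate; the name isSplatMask and the std::vector<int> mask representation (mirroring the SmallVector<int, 16> returned by ShuffleVectorInst::getShuffleMask()) are illustrative only, not LLVM API.

#include <cstdio>
#include <vector>

// Illustrative stand-in for the CostModel.cpp check: a mask is a splat
// iff every entry selects the same valid source lane.
static bool isSplatMask(const std::vector<int> &Mask, unsigned NumElems) {
  if (Mask.empty())
    return false;
  for (int Elt : Mask)
    if (Elt != Mask[0] || Elt < 0 || Elt >= static_cast<int>(NumElems))
      return false;
  return true;
}

int main() {
  std::vector<int> Broadcast(4, 0);         // <0, 0, 0, 0>: splat of lane 0
  std::vector<int> Identity = {0, 1, 2, 3}; // selects four lanes: not a splat
  std::printf("broadcast=%d identity=%d\n", isSplatMask(Broadcast, 4),
              isSplatMask(Identity, 4));
  return 0;
}

A single linear scan like this accepts exactly the same masks as the patch's two nested loops, which instead enumerate all NumVecElems candidate splat masks and compare; both reject undef (-1) entries and both accept the broadcast masks produced by the insertelement/shufflevector idiom exercised in the tests above.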