Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -277,13 +277,13 @@
 /// Implement the transforms common to all CastInst visitors.
 Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
   Value *Src = CI.getOperand(0);
+  auto *Ty = CI.getType();
 
   // Try to eliminate a cast of a cast.
   if (auto *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
     if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
       // The first cast (CSrc) is eliminable so we need to fix up or replace
       // the second cast (CI). CSrc will then have a good chance of being dead.
-      auto *Ty = CI.getType();
       auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
       // Point debug users of the dying cast to the new one.
       if (CSrc->hasOneUse())
@@ -319,6 +319,23 @@
         return NV;
   }
 
+  // Canonicalize a one-use unary shuffle to come after the cast if neither
+  // operation changes the element count or the total bit width of the vector.
+  // TODO: We could allow size-changing ops if that doesn't harm codegen.
+  // cast (shuffle X, Mask) --> shuffle (cast X), Mask
+  Value *X;
+  ArrayRef<int> Mask;
+  if (match(Src, m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(Mask))))) {
+    auto *SrcTy = dyn_cast<FixedVectorType>(X->getType());
+    auto *DestTy = dyn_cast<FixedVectorType>(Ty);
+    if (SrcTy && DestTy &&
+        SrcTy->getNumElements() == DestTy->getNumElements() &&
+        SrcTy->getPrimitiveSizeInBits() == DestTy->getPrimitiveSizeInBits()) {
+      Value *CastX = Builder.CreateCast(CI.getOpcode(), X, DestTy);
+      return new ShuffleVectorInst(CastX, UndefValue::get(DestTy), Mask);
+    }
+  }
+
   return nullptr;
 }
 
Index: llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
+++ llvm/test/Transforms/InstCombine/X86/x86-f16c-inseltpoison.ll
@@ -24,8 +24,8 @@
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;
Index: llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
+++ llvm/test/Transforms/InstCombine/X86/x86-f16c.ll
@@ -24,8 +24,8 @@
 ; All 8 elements required.
 define <8 x float> @demand_vcvtph2ps_256(<8 x i16> %A) {
 ; CHECK-LABEL: @demand_vcvtph2ps_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <8 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x half>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[CVTPH2PS:%.*]] = fpext <8 x half> [[TMP2]] to <8 x float>
 ; CHECK-NEXT:    ret <8 x float> [[CVTPH2PS]]
 ;
Index: llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
===================================================================
--- llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
+++ llvm/test/Transforms/InstCombine/shuffle-cast-dist.ll
@@ -4,8 +4,8 @@
 define <2 x float> @vtrn1(<2 x i32> %v)
 ; CHECK-LABEL: @vtrn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R_UNCASTED:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[R:%.*]] = bitcast <2 x i32> [[R_UNCASTED]] to <2 x float>
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <2 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    ret <2 x float> [[R]]
 ;
 {
Index: llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
===================================================================
--- llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
+++ llvm/test/Transforms/InstCombine/shufflevec-bitcast-inseltpoison.ll
@@ -56,9 +56,9 @@
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>
Index: llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
===================================================================
--- llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
+++ llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -56,9 +56,9 @@
 
 define <4 x i32> @splat_bitcast_operand_same_size_src_elt(<4 x float> %x) {
 ; CHECK-LABEL: @splat_bitcast_operand_same_size_src_elt(
-; CHECK-NEXT:    [[S1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT:    [[S2:%.*]] = bitcast <4 x float> [[S1]] to <4 x i32>
-; CHECK-NEXT:    ret <4 x i32> [[S2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[X:%.*]] to <4 x i32>
+; CHECK-NEXT:    [[BC:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i32> [[BC]]
 ;
   %s1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
   %bc = bitcast <4 x float> %s1 to <4 x i32>
Index: llvm/test/Transforms/InstCombine/vector-casts.ll
===================================================================
--- llvm/test/Transforms/InstCombine/vector-casts.ll
+++ llvm/test/Transforms/InstCombine/vector-casts.ll
@@ -413,8 +413,8 @@
 
 define <4 x float> @sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = sitofp <4 x i32> [[S]] to <4 x float>
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <4 x i32> [[X:%.*]] to <4 x float>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x float> [[R]]
 ;
   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -424,8 +424,8 @@
 
 define <3 x half> @uitofp_shuf(<3 x i16> %x) {
 ; CHECK-LABEL: @uitofp_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <3 x i16> [[X:%.*]], <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
-; CHECK-NEXT:    [[R:%.*]] = uitofp <3 x i16> [[S]] to <3 x half>
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <3 x i16> [[X:%.*]] to <3 x half>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x half> [[TMP1]], <3 x half> undef, <3 x i32> <i32 2, i32 undef, i32 0>
 ; CHECK-NEXT:    ret <3 x half> [[R]]
 ;
   %s = shufflevector <3 x i16> %x, <3 x i16> poison, <3 x i32> <i32 2, i32 undef, i32 0>
@@ -435,8 +435,8 @@
 
 define <4 x i64> @fptosi_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
-; CHECK-NEXT:    [[R:%.*]] = fptosi <4 x double> [[S]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <4 x double> [[X:%.*]] to <4 x i64>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> undef, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
 ; CHECK-NEXT:    ret <4 x i64> [[R]]
 ;
   %s = shufflevector <4 x double> %x, <4 x double> poison, <4 x i32> <i32 undef, i32 2, i32 3, i32 undef>
@@ -446,8 +446,8 @@
 
 define <2 x i32> @fptoui_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_shuf(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = fptoui <2 x float> [[S]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[X:%.*]] to <2 x i32>
+; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    ret <2 x i32> [[R]]
 ;
   %s = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 1>
@@ -455,6 +455,9 @@
   ret <2 x i32> %r
 }
 
+; negative test - the cast changes the element size (i32 -> half)
+; TODO: Should we reduce the width of the shuffle?
+
 define <4 x half> @narrowing_sitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -466,6 +469,8 @@
   ret <4 x half> %r
 }
 
+; negative test - the cast changes the element size (i32 -> double)
+
 define <4 x double> @widening_uitofp_shuf(<4 x i32> %x) {
 ; CHECK-LABEL: @widening_uitofp_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
@@ -477,6 +482,8 @@
   ret <4 x double> %r
 }
 
+; negative test - the shuffle changes the number of elements (4 -> 3)
+
 define <3 x i64> @fptosi_narrowing_shuf(<4 x double> %x) {
 ; CHECK-LABEL: @fptosi_narrowing_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x double> [[X:%.*]], <4 x double> poison, <3 x i32> <i32 undef, i32 2, i32 3>
@@ -488,6 +495,9 @@
   ret <3 x i64> %r
 }
 
+; negative test - the shuffle changes the number of elements (2 -> 3)
+; TODO: Should we reduce the width of the cast?
+
 define <3 x i32> @fptoui_widening_shuf(<2 x float> %x) {
 ; CHECK-LABEL: @fptoui_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <3 x i32> <i32 1, i32 1, i32 0>
@@ -499,6 +509,9 @@
   ret <3 x i32> %r
 }
 
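+; negative test - the shuffle changes the number of elements (2 -> 4) and the cast changes the element size (i32 -> half)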
+; TODO: Should we reduce the width of the cast?
+
 define <4 x half> @narrowing_sitofp_widening_shuf(<2 x i32> %x) {
 ; CHECK-LABEL: @narrowing_sitofp_widening_shuf(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <2 x i32> [[X:%.*]], <2 x i32> poison, <4 x i32> <i32 1, i32 0, i32 0, i32 undef>
@@ -512,6 +525,8 @@
 
 declare void @use(<4 x i32>)
 
+; negative test - the shuffle has more than one use
+
 define <4 x float> @sitofp_shuf_extra_use(<4 x i32> %x) {
 ; CHECK-LABEL: @sitofp_shuf_extra_use(
 ; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 undef>