Index: lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- lib/Transforms/InstCombine/InstCombineInternal.h
+++ lib/Transforms/InstCombine/InstCombineInternal.h
@@ -799,6 +799,10 @@
   ///
   /// If the multiplication is known not to overflow then NoSignedWrap is set.
   Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+
+  /// Rearrange a shuffle-bitcast-shuffle sequence into
+  /// shuffle-shuffle-bitcast or bitcast-shuffle-shuffle.
+  Instruction *RearangeShuffleBitcastShuffle(ShuffleVectorInst &Shuf);
 };
 
 } // end namespace llvm
Index: lib/Transforms/InstCombine/InstCombineVectorOps.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1206,6 +1206,11 @@
     Value *V = EvaluateInDifferentElementOrder(LHS, Mask);
     return replaceInstUsesWith(SVI, V);
   }
+  if (isa<UndefValue>(RHS) && isa<BitCastInst>(LHS)) {
+    Instruction *Inst = RearangeShuffleBitcastShuffle(SVI);
+    if (Inst)
+      return replaceInstUsesWith(SVI, Inst);
+  }
 
   // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
   // a non-vector type. We can instead bitcast the original vector followed by
Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1408,6 +1408,261 @@
   } while (true);
 }
 
+// Recomputes a shuffle mask for a vector type with smaller elements.
+// InNumElts, InEltBytes     - the target vector shape, e.g. <16 x 1-byte>
+// ShufNumElts, ShufEltBytes - the current shuffle shape, e.g. <4 x 4-byte>
+// ShufMask                  - the current shuffle mask
+// NewShufMask               - the recomputed shuffle mask
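+//
+// Worked example (illustrative, not lifted from the test suite): recomputing
+// a <4 x i32> mask <1, 0, 3, 2> for <8 x i16> uses ChunkSize = 2 and yields
+// <2, 3, 0, 1, 6, 7, 4, 5>, i.e. i32 lane k expands to the i16 lane pair
+// (2*k, 2*k+1).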
+static bool ShuffleMaskForSmallerVectorElement(
+    int InNumElts, int ShufNumElts, int InEltBytes, int ShufEltBytes,
+    const SmallVector<int, 16> &ShufMask, SmallVector<int, 16> &NewShufMask) {
+
+  if ((int)ShufMask.size() != ShufNumElts)
+    return false;
+  if ((InNumElts * InEltBytes) != (ShufNumElts * ShufEltBytes))
+    return false;
+  int ChunkSize = ShufEltBytes / InEltBytes;
+  if (!ChunkSize)
+    return false;
+  NewShufMask.resize(InNumElts);
+  for (int i = 0; i < ShufNumElts; ++i) {
+    int ShufMaskElt = ShufMask[i];
+    for (int j = 0, m = i * ChunkSize; j < ChunkSize; ++j) {
+      if ((ShufMaskElt < 0) || (ShufMaskElt >= ShufNumElts))
+        NewShufMask[m + j] = -1;
+      else
+        NewShufMask[m + j] = (ShufMaskElt * ChunkSize) + j;
+    }
+  }
+  return true;
+}
+
+// Recomputes a shuffle mask for a vector type with bigger elements.
+// InNumElts, InEltBytes     - the target vector shape, e.g. <4 x 4-byte>
+// ShufNumElts, ShufEltBytes - the current shuffle shape, e.g. <16 x 1-byte>
+// ShufMask                  - the current shuffle mask
+// NewShufMask               - the recomputed shuffle mask
+// NOTE: this does not always succeed.
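+//
+// Worked example (illustrative, not lifted from the test suite): an <8 x i16>
+// mask <2, 3, 0, 1, 6, 7, 4, 5> widens to the <4 x i32> mask <1, 0, 3, 2>,
+// because every pair of i16 lanes is sequential and starts on an i32
+// boundary; a mask beginning <0, 2, ...> cannot be widened because the pair
+// (0, 2) is not sequential.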
+static bool ShuffleMaskForBiggerVectorElement(
+    int InNumElts, int ShufNumElts, int InEltBytes, int ShufEltBytes,
+    const SmallVector<int, 16> &ShufMask, SmallVector<int, 16> &NewShufMask) {
+
+  if ((int)ShufMask.size() != ShufNumElts)
+    return false;
+  if ((InNumElts * InEltBytes) != (ShufNumElts * ShufEltBytes))
+    return false;
+  int ChunkSize = InEltBytes / ShufEltBytes;
+  if (!ChunkSize)
+    return false;
+
+  NewShufMask.resize(InNumElts);
+  for (int i = 0, m = 0; i < ShufNumElts; i += ChunkSize, m++) {
+    int ShufMaskElt = ShufMask[i];
+    bool IsUndef = (ShufMaskElt < 0) || (ShufMaskElt >= ShufNumElts);
+    // A defined chunk must start on an element boundary of the bigger type.
+    if (!IsUndef && (ShufMaskElt % ChunkSize) != 0)
+      return false;
+    for (int j = 0; j < ChunkSize; ++j) {
+      if (IsUndef) {
+        // Either all elements in a chunk are undefined ...
+        if (ShufMask[i + j] != ShufMaskElt)
+          return false;
+      } else {
+        // ... or they are all sequential.
+        if (ShufMask[i + j] != (ShufMaskElt + j))
+          return false;
+      }
+    }
+    NewShufMask[m] = IsUndef ? -1 : ShufMaskElt / ChunkSize;
+  }
+  return true;
+}
+
+static Constant *ShuffleMaskToConstantVector(ArrayRef<int> ShufMask,
+                                             InstCombiner::BuilderTy &Builder) {
+  SmallVector<Constant *, 16> MaskValues;
+  for (int i = 0, e = ShufMask.size(); i != e; ++i) {
+    if (ShufMask[i] == -1)
+      MaskValues.push_back(UndefValue::get(Builder.getInt32Ty()));
+    else
+      MaskValues.push_back(Builder.getInt32(ShufMask[i]));
+  }
+  return ConstantVector::get(MaskValues);
+}
+
+template <typename VectorInstType>
+static bool getVectorInstructionTypeInfo(const VectorInstType &VectInst,
+                                         int &NumElts, int &EltBytes) {
+  NumElts = EltBytes = 0;
+  VectorType *VectInstTy = dyn_cast<VectorType>(VectInst.getType());
+  if (!VectInstTy)
+    return false;
+  NumElts = VectInstTy->getVectorNumElements();
+  EltBytes = VectInstTy->getVectorElementType()->getScalarSizeInBits() / 8;
+  return true;
+}
+
+template <typename VectorInstType>
+static bool getVectorInstructionOperandTypeInfo(const VectorInstType &VectInst,
+                                                int OpNum, int &NumElts,
+                                                int &EltBytes) {
+  NumElts = EltBytes = 0;
+  Value *VectInstOp = VectInst.getOperand(OpNum);
+  if (isa<UndefValue>(VectInstOp))
+    return true;
+  VectorType *VectInstOpTy = dyn_cast<VectorType>(VectInstOp->getType());
+  if (!VectInstOpTy)
+    return false;
+  NumElts = VectInstOpTy->getVectorNumElements();
+  EltBytes = VectInstOpTy->getVectorElementType()->getScalarSizeInBits() / 8;
+  return true;
+}
+
+static bool ComputeShuffleMaskForSmallerVectorElement(
+    const BinaryOperator &Inst, const ShuffleVectorInst &Shuf,
+    Constant *&NewMask, InstCombiner::BuilderTy &Builder) {
+
+  SmallVector<int, 16> ShufMask = Shuf.getShuffleMask();
+  int InstNumElts, ShufNumElts, InstEltBytes, ShufEltBytes;
+  if (!getVectorInstructionTypeInfo(Inst, InstNumElts, InstEltBytes))
+    return false;
+  if (!getVectorInstructionTypeInfo(Shuf, ShufNumElts, ShufEltBytes))
+    return false;
+  SmallVector<int, 16> NewShufMask;
+  bool ok = ShuffleMaskForSmallerVectorElement(InstNumElts, ShufNumElts,
+                                               InstEltBytes, ShufEltBytes,
+                                               ShufMask, NewShufMask);
+  if (!ok)
+    return false;
+  NewMask = ShuffleMaskToConstantVector(NewShufMask, Builder);
+  return true;
+}
+
+static bool ComputeShuffleMaskForBiggerVectorElement(
+    const BinaryOperator &Inst, const ShuffleVectorInst &Shuf,
+    Constant *&NewMask, InstCombiner::BuilderTy &Builder) {
+
+  SmallVector<int, 16> ShufMask = Shuf.getShuffleMask();
+  int InstNumElts, ShufNumElts, InstEltBytes, ShufEltBytes;
+  if (!getVectorInstructionTypeInfo(Inst, InstNumElts, InstEltBytes))
+    return false;
+  if (!getVectorInstructionTypeInfo(Shuf, ShufNumElts, ShufEltBytes))
+    return false;
+  SmallVector<int, 16> NewShufMask;
+  bool ok = ShuffleMaskForBiggerVectorElement(InstNumElts, ShufNumElts,
+                                              InstEltBytes, ShufEltBytes,
+                                              ShufMask, NewShufMask);
+  if (!ok)
+    return false;
+  NewMask = ShuffleMaskToConstantVector(NewShufMask, Builder);
+  return true;
+}
+
+static bool ComputeShuffleMaskForDifferentVectorType(
+    const BinaryOperator &Inst, const ShuffleVectorInst &Shuf,
+    Constant *&NewMask, InstCombiner::BuilderTy &Builder) {
+
+  int InstNumElts, ShufNumElts, InstEltBytes, ShufEltBytes;
+  if (!getVectorInstructionTypeInfo(Inst, InstNumElts, InstEltBytes))
+    return false;
+  if (!getVectorInstructionTypeInfo(Shuf, ShufNumElts, ShufEltBytes))
+    return false;
+  if (InstNumElts > ShufNumElts)
+    return ComputeShuffleMaskForSmallerVectorElement(Inst, Shuf, NewMask,
+                                                     Builder);
+  if (InstNumElts < ShufNumElts)
+    return ComputeShuffleMaskForBiggerVectorElement(Inst, Shuf, NewMask,
+                                                    Builder);
+  return false;
+}
+
+static Instruction *BitcastShuffleShuffleSequence(
+    ShuffleVectorInst &Shuf1, ShuffleVectorInst &Shuf2,
+    const SmallVector<int, 16> &NewShufMask, InstCombiner::BuilderTy &Builder) {
+  Constant *NewShuf2MaskC = ShuffleMaskToConstantVector(NewShufMask, Builder);
+  Constant *Shuf2MaskC = Shuf2.getMask();
+  Value *NewBitCast =
+      Builder.CreateBitCast(Shuf1.getOperand(0), Shuf2.getType());
+  Value *NewShuf1 = Builder.CreateShuffleVector(
+      NewBitCast, UndefValue::get(Shuf2.getType()), NewShuf2MaskC);
+  Value *NewShuf2 = Builder.CreateShuffleVector(
+      NewShuf1, UndefValue::get(Shuf2.getType()), Shuf2MaskC);
+  return cast<Instruction>(NewShuf2);
+}
+
+static Instruction *ShuffleShuffleBitcastSequence(
+    ShuffleVectorInst &Shuf1, ShuffleVectorInst &Shuf2,
+    const SmallVector<int, 16> &NewShufMask, InstCombiner::BuilderTy &Builder) {
+  Constant *NewShuf2MaskC = ShuffleMaskToConstantVector(NewShufMask, Builder);
+  Value *NewShuf2 = Builder.CreateShuffleVector(
+      &Shuf1, UndefValue::get(Shuf1.getType()), NewShuf2MaskC);
+  Value *NewBitCast = Builder.CreateBitCast(NewShuf2, Shuf2.getType());
+  return cast<Instruction>(NewBitCast);
+}
+
+// Rearrange a shuffle-bitcast-shuffle sequence
+//   x1 = shuffle(x0, mask1); x2 = bitcast(x1); x3 = shuffle(x2, mask2);
+// into one of two possible forms:
+//   1. x1 = shuffle(x0, mask1); x2 = shuffle(x1, new_mask2); x3 = bitcast(x2);
+//   2. x1 = bitcast(x0); x2 = shuffle(x1, new_mask1); x3 = shuffle(x2, mask2);
+// The backend will usually replace two adjacent shuffles with a single
+// shuffle instruction that uses the combined mask.
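+//
+// Illustrative example (not taken from the test suite): given
+//   %s1 = shufflevector <4 x i32> %v, <4 x i32> undef,
+//                       <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+//   %b  = bitcast <4 x i32> %s1 to <8 x i16>
+//   %s2 = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> mask2
+// form 2 bitcasts %v to <8 x i16> first and replays %s1 as the <8 x i16> mask
+// <2, 3, 0, 1, 6, 7, 4, 5>, leaving the two shuffles adjacent; form 1 is used
+// instead when mask2 can be widened to a <4 x i32> mask.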
+Instruction *
+InstCombiner::RearangeShuffleBitcastShuffle(ShuffleVectorInst &Shuf) {
+  Value *ShufLHS = Shuf.getOperand(0);
+  Value *ShufRHS = Shuf.getOperand(1);
+
+  if (!isa<BitCastInst>(ShufLHS))
+    return nullptr;
+  if (!isa<UndefValue>(ShufRHS))
+    return nullptr;
+  BitCastInst *ShufBc = cast<BitCastInst>(ShufLHS);
+
+  Value *ShufBcOp = ShufBc->getOperand(0);
+  if (!isa<ShuffleVectorInst>(ShufBcOp))
+    return nullptr;
+  ShuffleVectorInst *BcShuf = cast<ShuffleVectorInst>(ShufBcOp);
+
+  int ShufNumElts, BcShufNumElts, ShufEltBytes, BcShufEltBytes;
+  if (!getVectorInstructionTypeInfo(Shuf, ShufNumElts, ShufEltBytes))
+    return nullptr;
+  if (!getVectorInstructionTypeInfo(*BcShuf, BcShufNumElts, BcShufEltBytes))
+    return nullptr;
+  if ((ShufNumElts * ShufEltBytes) != (BcShufNumElts * BcShufEltBytes))
+    return nullptr;
+
+  SmallVector<int, 16> ShufMask = Shuf.getShuffleMask();
+  SmallVector<int, 16> BcShufMask = BcShuf->getShuffleMask();
+  SmallVector<int, 16> NewShufMask;
+  if (BcShufEltBytes > ShufEltBytes) {
+    // Try to widen the outer shuffle's mask to the inner shuffle's type.
+    if (ShuffleMaskForBiggerVectorElement(BcShufNumElts, ShufNumElts,
+                                          BcShufEltBytes, ShufEltBytes,
+                                          ShufMask, NewShufMask))
+      return ShuffleShuffleBitcastSequence(*BcShuf, Shuf, NewShufMask, Builder);
+    // Otherwise narrow the inner shuffle's mask to the outer shuffle's type.
+    if (!ShuffleMaskForSmallerVectorElement(ShufNumElts, BcShufNumElts,
+                                            ShufEltBytes, BcShufEltBytes,
+                                            BcShufMask, NewShufMask))
+      return nullptr;
+    return BitcastShuffleShuffleSequence(*BcShuf, Shuf, NewShufMask, Builder);
+  }
+  // ShufEltBytes >= BcShufEltBytes: try to widen the inner shuffle's mask.
+  if (ShuffleMaskForBiggerVectorElement(ShufNumElts, BcShufNumElts,
+                                        ShufEltBytes, BcShufEltBytes,
+                                        BcShufMask, NewShufMask))
+    return BitcastShuffleShuffleSequence(*BcShuf, Shuf, NewShufMask, Builder);
+  // Otherwise narrow the outer shuffle's mask to the inner shuffle's type.
+  if (!ShuffleMaskForSmallerVectorElement(BcShufNumElts, ShufNumElts,
+                                          BcShufEltBytes, ShufEltBytes,
+                                          ShufMask, NewShufMask))
+    return nullptr;
+  return ShuffleShuffleBitcastSequence(*BcShuf, Shuf, NewShufMask, Builder);
+}
+
 /// \brief Creates node of binary operation with the same attributes as the
 /// specified one but with other operands.
 static Value *CreateBinOpAsGiven(BinaryOperator &Inst, Value *LHS, Value *RHS,
@@ -1451,7 +1706,46 @@
     return Builder.CreateShuffleVector(
         NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
   }
-
+  // Both operands of the binary operation are bitcasts of shuffles, but the
+  // binary operation's vector element type differs from the shuffles' vector
+  // element type, e.g. the shuffles operate on <4 x i32> while the binary
+  // operation operates on <16 x i8>. To move the shuffle past the binary
+  // operation we have to change the shuffle's vector type and recompute its
+  // mask. Narrowing the shuffle's element type is always possible, but
+  // widening it is not always possible.
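+  // Illustrative example (not taken from the test file below): if both
+  // operands are bitcasts to <8 x i16> of <4 x i32> shuffles that use the
+  // same mask <1, 0, 3, 2> and an undef second operand, the add can be done
+  // directly on the bitcast inputs and followed by a single <8 x i16> shuffle
+  // with the recomputed mask <2, 3, 0, 1, 6, 7, 4, 5>.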
+  BitCastInst *LBitCast = dyn_cast<BitCastInst>(LHS);
+  BitCastInst *RBitCast = dyn_cast<BitCastInst>(RHS);
+  if (LBitCast && RBitCast) {
+    Value *LBitCastOp = LBitCast->getOperand(0);
+    Value *RBitCastOp = RBitCast->getOperand(0);
+    ShuffleVectorInst *LBcShuf = dyn_cast<ShuffleVectorInst>(LBitCastOp);
+    ShuffleVectorInst *RBcShuf = dyn_cast<ShuffleVectorInst>(RBitCastOp);
+
+    if (LBcShuf && RBcShuf && LBcShuf->getMask() == RBcShuf->getMask() &&
+        isa<UndefValue>(LBcShuf->getOperand(1)) &&
+        isa<UndefValue>(RBcShuf->getOperand(1)) &&
+        LBcShuf->getOperand(0)->getType() ==
+            RBcShuf->getOperand(0)->getType()) {
+      Constant *NewMask;
+      bool ok = ComputeShuffleMaskForDifferentVectorType(Inst, *LBcShuf,
+                                                         NewMask, Builder);
+      if (ok) {
+        Value *NewLBitCast =
+            Builder.CreateBitCast(LBcShuf->getOperand(0), Inst.getType());
+        Value *NewRBitCast =
+            Builder.CreateBitCast(RBcShuf->getOperand(0), Inst.getType());
+
+        Value *NewBinOp =
+            CreateBinOpAsGiven(Inst, NewLBitCast, NewRBitCast, Builder);
+
+        Value *NewShuf = Builder.CreateShuffleVector(
+            NewBinOp, UndefValue::get(Inst.getType()), NewMask);
+        return NewShuf;
+      }
+    }
+  }
 
   // If one argument is a shuffle within one vector, the other is a constant,
   // try moving the shuffle after the binary operation.
   ShuffleVectorInst *Shuffle = nullptr;
Index: test/Transforms/InstCombine/vec_shuffle.ll
===================================================================
--- test/Transforms/InstCombine/vec_shuffle.ll
+++ test/Transforms/InstCombine/vec_shuffle.ll
@@ -463,3 +463,297 @@
   %1 = shufflevector <4 x i32*> %A, <4 x i32*> undef, <2 x i32>
   ret <2 x i32*> %1
 }
+
+; Function Attrs: noinline nounwind uwtable
+define <2 x i64> @shuffle_32_add_16_shuffle_32_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_32_add_16_shuffle_32_masks_are_eq(
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> %v to <8 x i16>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> %v to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <8 x i16>
+  %add.i = add <8 x i16> %bc2, %bc2
+  %bc3 = bitcast <8 x i16> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+; Function Attrs: noinline nounwind uwtable
+define <2 x i64> @shuffle_32_add_8_shuffle_32_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_32_add_8_shuffle_32_masks_are_eq(
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <16 x i8>
+  %add.i = add <16 x i8> %bc2, %bc2
+  %bc3 = bitcast <16 x i8> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+; Function Attrs: noinline nounwind uwtable
+define <2 x i64> @shuffle_8_add_32_shuffle_8_masks_are_eq(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_8_add_32_shuffle_8_masks_are_eq(
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> %v to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> %v to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+;
+  %bc0 = bitcast <2 x i64> %v to <16 x i8>
+  %shuffle = shufflevector <16 x i8> %bc0, <16 x i8> zeroinitializer, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <4 x i32>
+  %add.i = add <4 x i32> %bc2, %bc2
+  %bc3 = bitcast <4 x i32> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <16 x i8>
+  %shuffle4 = shufflevector <16 x i8> %bc4, <16 x i8> zeroinitializer, <16 x i32>
+  %bc5 = bitcast <16 x i8> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+define <8 x i16> @shuffle_32_add_16_masks_are_eq(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_32_add_16_masks_are_eq
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> %v1 to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> %v2 to <8 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %bc2 = bitcast <4 x i32> %shuffle2 to <8 x i16>
+  %add = add <8 x i16> %bc1, %bc2
+  ret <8 x i16> %add
+}
+
+define <16 x i8> @shuffle_32_add_8_masks_are_eq(<4 x i32> %v1, <4 x i32> %v2) {
+; CHECK-LABEL: @shuffle_32_add_8_masks_are_eq
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %bc2 = bitcast <4 x i32> %shuffle2 to <16 x i8>
+  %add = add <16 x i8> %bc1, %bc2
+  ret <16 x i8> %add
+}
+
+define <16 x i8> @shuffle_16_add_8_masks_are_eq(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: @shuffle_16_add_8_masks_are_eq
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
+
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
+  %shuffle2 = shufflevector <8 x i16> %v2, <8 x i16> undef, <8 x i32>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <16 x i8>
+  %bc2 = bitcast <8 x i16> %shuffle2 to <16 x i8>
+  %add = add <16 x i8> %bc1, %bc2
+  ret <16 x i8> %add
+}
+
+define <4 x i32> @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up(<8 x i16> %v1, <8 x i16> %v2) {
+; CHECK-LABEL: @shuffle_16_add_32_masks_are_eq_and_can_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> %v1 to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> %v2 to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
+  %shuffle2 = shufflevector <8 x i16> %v2, <8 x i16> undef, <8 x i32>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %bc2 = bitcast <8 x i16> %shuffle2 to <4 x i32>
+  %add = add <4 x i32> %bc1, %bc2
+  ret <4 x i32> %add
+}
+
+define <4 x i32> @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up(<16 x i8> %v1, <16 x i8> %v2) {
+; CHECK-LABEL: @shuffle_8_add_32_masks_are_eq_and_can_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> %v1 to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> %v2 to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+  %shuffle2 = shufflevector <16 x i8> %v2, <16 x i8> undef, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %bc2 = bitcast <16 x i8> %shuffle2 to <4 x i32>
+  %add = add <4 x i32> %bc1, %bc2
+  ret <4 x i32> %add
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<4 x i32>(v)))
+
+define <8 x i16> @shuffle_32_bitcast_16_shuffle_16_can_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_16_shuffle_16_can_be_converted_up
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> undef, <8 x i32>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<4 x i32>(v)))
+
+define <8 x i16> @shuffle_32_bitcast_16_shuffle_16_can_not_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_16_shuffle_16_can_not_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> %v1 to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> undef, <8 x i32>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<16 x i8>( bitcast<16 x i8>( shuffle<4 x i32>(v)))
+
+define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_8_shuffle_8_can_be_converted_up
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %shuffle2 = shufflevector <16 x i8> %bc1, <16 x i8> undef, <16 x i32>
+  ret <16 x i8> %shuffle2
+}
+
+; shuffle<16 x i8>( bitcast<16 x i8>( shuffle<4 x i32>(v)))
+
+define <16 x i8> @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up(<4 x i32> %v1) {
+; CHECK-LABEL: @shuffle_32_bitcast_8_shuffle_8_can_not_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+
+  %shuffle1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32>
+  %bc1 = bitcast <4 x i32> %shuffle1 to <16 x i8>
+  %shuffle2 = shufflevector <16 x i8> %bc1, <16 x i8> undef, <16 x i32>
+  ret <16 x i8> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
+
+define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> %v1 to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> undef, <4 x i32>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
+
+define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up(<8 x i16> %v1) {
+; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> %v1 to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> undef, <4 x i32>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<16 x i8>(v)))
+
+define <4 x i32> @shuffle_8_bitcast_32_shuffle_32_can_not_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_32_shuffle_32_can_not_be_converted_up
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> undef, <4 x i32>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<4 x i32>( bitcast<4 x i32>( shuffle<8 x i16>(v)))
+
+define <4 x i32> @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up(<8 x i16> %v1) {
+; CHECK-LABEL: @shuffle_16_bitcast_32_shuffle_32_can_not_be_converted_up
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+
+  %shuffle1 = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32>
+  %bc1 = bitcast <8 x i16> %shuffle1 to <4 x i32>
+  %shuffle2 = shufflevector <4 x i32> %bc1, <4 x i32> undef, <4 x i32>
+  ret <4 x i32> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
+
+define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can_be_converted_up
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> %v1 to <8 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> undef, <8 x i32>
+  ret <8 x i16> %shuffle2
+}
+
+; shuffle<8 x i16>( bitcast<8 x i16>( shuffle<16 x i8>(v)))
+
+define <8 x i16> @shuffle_8_bitcast_16_shuffle_16_can_not_be_converted_up(<16 x i8> %v1) {
+; CHECK-LABEL: @shuffle_8_bitcast_16_shuffle_16_can_not_be_converted_up
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+
+  %shuffle1 = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32>
+  %bc1 = bitcast <16 x i8> %shuffle1 to <8 x i16>
+  %shuffle2 = shufflevector <8 x i16> %bc1, <8 x i16> undef, <8 x i32>
+  ret <8 x i16> %shuffle2
+}