Index: lib/Transforms/InstCombine/InstructionCombining.cpp
===================================================================
--- lib/Transforms/InstCombine/InstructionCombining.cpp
+++ lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1451,6 +1451,49 @@
     return Builder.CreateShuffleVector(
         NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
   }
+  // Both arguments of the binary operation are bitcasts of shuffle
+  // instructions, i.e. the binary operation vector type is different from the
+  // shuffle vector type, e.g. the shuffle operands' data type is <4 x i32>,
+  // but the binary operation operands' data type is <16 x i8>. In this
+  // situation, in order to move the shuffle instruction behind the binary
+  // operation instruction we need four bitcast instructions: one for the
+  // source operand of each of the two shuffles, one for the binary operation
+  // result, and one for the shuffle result. That looks like a lot of bitcast
+  // instructions, but they will all be eliminated during the subsequent
+  // instruction combine phases.
+  // The other approach, changing the shuffle instruction data type and
+  // recomputing the shuffle mask, has very limited usage: first, we can
+  // recompute the shuffle mask only when the mask is a constant value, and
+  // secondly, we can do this only when we need to change the shuffle
+  // instruction vector type from <4 x i32> to <16 x i8>, but not vice versa.
+  BitCastInst *LBitCast = dyn_cast<BitCastInst>(LHS);
+  BitCastInst *RBitCast = dyn_cast<BitCastInst>(RHS);
+  if (LBitCast && RBitCast) {
+    Value *LBitCastOp = LBitCast->getOperand(0);
+    Value *RBitCastOp = RBitCast->getOperand(0);
+    ShuffleVectorInst *LBcShuf = dyn_cast<ShuffleVectorInst>(LBitCastOp);
+    ShuffleVectorInst *RBcShuf = dyn_cast<ShuffleVectorInst>(RBitCastOp);
+
+    if (LBcShuf && RBcShuf && LBcShuf->getMask() == RBcShuf->getMask() &&
+        isa<UndefValue>(LBcShuf->getOperand(1)) &&
+        isa<UndefValue>(RBcShuf->getOperand(1)) &&
+        LBcShuf->getOperand(0)->getType() ==
+            RBcShuf->getOperand(0)->getType()) {
+
+      Value *LBitCast =
+          Builder.CreateBitCast(LBcShuf->getOperand(0), Inst.getType());
+      Value *RBitCast =
+          Builder.CreateBitCast(RBcShuf->getOperand(0), Inst.getType());
+
+      Value *NewBinOp = CreateBinOpAsGiven(Inst, LBitCast, RBitCast, Builder);
+      Value *ShufBitCast = Builder.CreateBitCast(NewBinOp, LBcShuf->getType());
+
+      Value *NewShuf = Builder.CreateShuffleVector(
+          ShufBitCast, UndefValue::get(LBcShuf->getType()), LBcShuf->getMask());
+      Value *NewBitCast = Builder.CreateBitCast(NewShuf, Inst.getType());
+      return NewBitCast;
+    }
+  }
 
   // If one argument is a shuffle within one vector, the other is a constant,
   // try moving the shuffle after the binary operation.
Index: test/Transforms/InstCombine/vec_shuffle.ll
===================================================================
--- test/Transforms/InstCombine/vec_shuffle.ll
+++ test/Transforms/InstCombine/vec_shuffle.ll
@@ -463,3 +463,48 @@
   %1 = shufflevector <4 x i32*> %A, <4 x i32*> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x i32*> %1
 }
+
+; The <4 x i32> reversing shuffles around the <8 x i16> add should fold away.
+define <2 x i64> @shuffle_add2_32_16(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_add2_32_16(
+; CHECK-NEXT:  [[TMP0:%.*]] = bitcast <2 x i64> %v to <8 x i16>
+; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <2 x i64> %v to <8 x i16>
+; CHECK-NEXT:  [[TMP2:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:  [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:  ret <2 x i64> [[TMP3]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <8 x i16>
+  %add.i = add <8 x i16> %bc2, %bc2
+  %bc3 = bitcast <8 x i16> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+
+
+
+; The <4 x i32> reversing shuffles around the <16 x i8> add should fold away.
+define <2 x i64> @shuffle_add2_32_8(<2 x i64> %v) {
+; CHECK-LABEL: @shuffle_add2_32_8(
+; CHECK-NEXT:  [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT:  [[TMP1:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+; CHECK-NEXT:  [[TMP2:%.*]] = add <16 x i8> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:  [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; CHECK-NEXT:  ret <2 x i64> [[TMP3]]
+;
+  %bc0 = bitcast <2 x i64> %v to <4 x i32>
+  %shuffle = shufflevector <4 x i32> %bc0, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc1 = bitcast <4 x i32> %shuffle to <2 x i64>
+  %bc2 = bitcast <2 x i64> %bc1 to <16 x i8>
+  %add.i = add <16 x i8> %bc2, %bc2
+  %bc3 = bitcast <16 x i8> %add.i to <2 x i64>
+  %bc4 = bitcast <2 x i64> %bc3 to <4 x i32>
+  %shuffle4 = shufflevector <4 x i32> %bc4, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bc5 = bitcast <4 x i32> %shuffle4 to <2 x i64>
+  ret <2 x i64> %bc5
+}
+