diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1820,7 +1820,11 @@
 }
 
 /// This input value (which is known to have vector type) is being zero extended
-/// or truncated to the specified vector type.
+/// or truncated to the specified vector type. Since the zext/trunc is done
+/// using an integer type, we have a (bitcast(cast(bitcast))) pattern,
+/// endianness will impact which end of the vector that is extended or
+/// truncated.
+///
 /// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
 ///
 /// The source and destination vector types may have different element types.
@@ -1850,25 +1854,36 @@
   SmallVector<uint32_t, 16> ShuffleMask;
   Value *V2;
 
-  if (SrcTy->getNumElements() > DestTy->getNumElements()) {
-    // If we're shrinking the number of elements, just shuffle in the low
+  bool IsBigEndian = IC.getDataLayout().isBigEndian();
+  unsigned SrcElts = SrcTy->getNumElements();
+  unsigned DestElts = DestTy->getNumElements();
+
+  if (SrcElts > DestElts) {
+    // If we're shrinking the number of elements, just shuffle in the low/high
     // elements from the input and use undef as the second shuffle input.
     V2 = UndefValue::get(SrcTy);
-    for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
-      ShuffleMask.push_back(i);
+    unsigned FirstElt = IsBigEndian ? SrcElts - DestElts : 0;
+    for (unsigned i = 0, e = DestElts; i != e; ++i)
+      ShuffleMask.push_back(FirstElt + i);
 
   } else {
     // If we're increasing the number of elements, shuffle in all of the
     // elements from InVal and fill the rest of the result elements with zeros
     // from a constant zero.
+
+    // The excess elements reference the first element of the zero input (V2).
     V2 = Constant::getNullValue(SrcTy);
-    unsigned SrcElts = SrcTy->getNumElements();
+
+    if (IsBigEndian)
+      for (unsigned i = 0, e = DestElts - SrcElts; i != e; ++i)
+        ShuffleMask.push_back(SrcElts);
+
     for (unsigned i = 0, e = SrcElts; i != e; ++i)
       ShuffleMask.push_back(i);
 
-    // The excess elements reference the first element of the zero input.
-    for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i)
-      ShuffleMask.push_back(SrcElts);
+    if (!IsBigEndian)
+      for (unsigned i = 0, e = DestElts - SrcElts; i != e; ++i)
+        ShuffleMask.push_back(SrcElts);
   }
 
   return new ShuffleVectorInst(InVal, V2,
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -824,7 +824,7 @@
 
 define <3 x i32> @test60(<4 x i32> %call4) {
 ; CHECK-LABEL: @test60(
-; CHECK-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    ret <3 x i32> [[P10]]
 ;
   %p11 = bitcast <4 x i32> %call4 to i128
@@ -836,7 +836,7 @@
 
 define <4 x i32> @test61(<3 x i32> %call4) {
 ; CHECK-LABEL: @test61(
-; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <4 x i32> [[P10]]
 ;
   %p11 = bitcast <3 x i32> %call4 to i96
@@ -848,7 +848,7 @@
 define <4 x i32> @test62(<3 x float> %call4) {
 ; CHECK-LABEL: @test62(
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> [[CALL4:%.*]] to <3 x i32>
-; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
 ; CHECK-NEXT:    ret <4 x i32> [[P10]]
 ;
   %p11 = bitcast <3 x float> %call4 to i96