diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1820,7 +1820,11 @@
 }
 
 /// This input value (which is known to have vector type) is being zero extended
-/// or truncated to the specified vector type.
+/// or truncated to the specified vector type. Since the zext/trunc is done
+/// using an integer type, we have a (bitcast(cast(bitcast))) pattern,
+/// endianness will impact which end of the vector that is extended or
+/// truncated.
+///
 /// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
 ///
 /// The source and destination vector types may have different element types.
@@ -1850,25 +1854,42 @@
   SmallVector<uint32_t, 16> ShuffleMask;
   Value *V2;
 
-  if (SrcTy->getNumElements() > DestTy->getNumElements()) {
-    // If we're shrinking the number of elements, just shuffle in the low
-    // elements from the input and use undef as the second shuffle input.
+  bool IsBigEndian = IC.getDataLayout().isBigEndian();
+  unsigned SrcElts = SrcTy->getNumElements();
+  unsigned DestElts = DestTy->getNumElements();
+
+  if (SrcElts > DestElts) {
+    // If we're shrinking the number of elements, just shuffle in the elements
+    // from the input and use undef as the second shuffle input.
     V2 = UndefValue::get(SrcTy);
-    for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i)
-      ShuffleMask.push_back(i);
+    // For big endian we should pick elements with high indices, for little
+    // endian from low indices.
+    unsigned FirstElt = IsBigEndian ? SrcElts - DestElts : 0;
+    for (unsigned i = 0, e = DestElts; i != e; ++i)
+      ShuffleMask.push_back(FirstElt + i);
 
   } else {
     // If we're increasing the number of elements, shuffle in all of the
-    // elements from InVal and fill the rest of the result elements with zeros
-    // from a constant zero.
+    // elements from InVal. Fill the rest of the result elements with zeros
+    // from a constant zero. We use the first element of the zero input.
     V2 = Constant::getNullValue(SrcTy);
-    unsigned SrcElts = SrcTy->getNumElements();
+
+    // Create a helper mask for picking excessive values from first element of
+    // the null vector.
+    SmallVector<uint32_t, 16> ZeroMask;
+    ZeroMask.assign(DestElts - SrcElts, SrcElts);
+
+    // For big endian the zeroes should be inserted at low indices.
+    if (IsBigEndian)
+      ShuffleMask.insert(ShuffleMask.end(), ZeroMask.begin(), ZeroMask.end());
+
+    // Add the mask for picking values from the source vector <0, 1, 2 ...>.
     for (unsigned i = 0, e = SrcElts; i != e; ++i)
       ShuffleMask.push_back(i);
 
-    // The excess elements reference the first element of the zero input.
-    for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i)
-      ShuffleMask.push_back(SrcElts);
+    // For little endian the zeroes should be inserted at high indices.
+    if (!IsBigEndian)
+      ShuffleMask.insert(ShuffleMask.end(), ZeroMask.begin(), ZeroMask.end());
   }
 
   return new ShuffleVectorInst(InVal, V2,
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll
--- a/llvm/test/Transforms/InstCombine/cast.ll
+++ b/llvm/test/Transforms/InstCombine/cast.ll
@@ -823,9 +823,13 @@
 }
 
 define <3 x i32> @test60(<4 x i32> %call4) {
-; ALL-LABEL: @test60(
-; ALL-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
-; ALL-NEXT:    ret <3 x i32> [[P10]]
+; BE-LABEL: @test60(
+; BE-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; BE-NEXT:    ret <3 x i32> [[P10]]
+;
+; LE-LABEL: @test60(
+; LE-NEXT:    [[P10:%.*]] = shufflevector <4 x i32> [[CALL4:%.*]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; LE-NEXT:    ret <3 x i32> [[P10]]
 ;
   %p11 = bitcast <4 x i32> %call4 to i128
   %p9 = trunc i128 %p11 to i96
@@ -835,9 +839,13 @@
 }
 
 define <4 x i32> @test61(<3 x i32> %call4) {
-; ALL-LABEL: @test61(
-; ALL-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:    ret <4 x i32> [[P10]]
+; BE-LABEL: @test61(
+; BE-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; BE-NEXT:    ret <4 x i32> [[P10]]
+;
+; LE-LABEL: @test61(
+; LE-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[CALL4:%.*]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; LE-NEXT:    ret <4 x i32> [[P10]]
 ;
   %p11 = bitcast <3 x i32> %call4 to i96
   %p9 = zext i96 %p11 to i128
@@ -846,10 +854,15 @@
 }
 
 define <4 x i32> @test62(<3 x float> %call4) {
-; ALL-LABEL: @test62(
-; ALL-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> [[CALL4:%.*]] to <3 x i32>
-; ALL-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; ALL-NEXT:    ret <4 x i32> [[P10]]
+; BE-LABEL: @test62(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> [[CALL4:%.*]] to <3 x i32>
+; BE-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
+; BE-NEXT:    ret <4 x i32> [[P10]]
+;
+; LE-LABEL: @test62(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <3 x float> [[CALL4:%.*]] to <3 x i32>
+; LE-NEXT:    [[P10:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> <i32 0, i32 undef, i32 undef>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; LE-NEXT:    ret <4 x i32> [[P10]]
 ;
   %p11 = bitcast <3 x float> %call4 to i96
   %p9 = zext i96 %p11 to i128