Index: llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -690,6 +690,7 @@
   Value *Src = CI.getOperand(0);
   Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+  ConstantInt *Cst = nullptr;
 
   // Attempt to truncate the entire input expression tree to the destination
   // type. Only do this if the dest type is a simple type, don't convert the
@@ -758,7 +759,7 @@
   // more efficiently. Support vector types. Cleanup code by using m_OneUse.
 
   // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
-  Value *A = nullptr; ConstantInt *Cst = nullptr;
+  Value *A = nullptr;
   if (Src->hasOneUse() &&
       match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
     // We have three types to worry about here, the type of A, the source of
@@ -843,6 +844,37 @@
   if (Instruction *I = foldVecTruncToExtElt(CI, *this))
     return I;
 
+  // Whenever an element is extracted from a vector and then truncated,
+  // canonicalize to a bitcast of the whole vector followed by an
+  // extractelement of the narrower type.
+  //
+  // Example (little endian):
+  //   trunc (extractelement <4 x i64> %X, 0) to i32
+  //   --->
+  //   extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
+  Value *VecOp = nullptr;
+  if (match(Src,
+            m_OneUse(m_ExtractElement(m_Value(VecOp), m_ConstantInt(Cst))))) {
+    Type *VecOpTy = VecOp->getType();
+    unsigned DestScalarSize = DestTy->getScalarSizeInBits();
+    unsigned VecOpScalarSize = VecOpTy->getScalarSizeInBits();
+    unsigned VecNumElts = VecOpTy->getVectorNumElements();
+
+    // Each element must split evenly, or the bitcast/index math is invalid.
+    if (VecOpScalarSize % DestScalarSize == 0) {
+      unsigned BitCastNumElts = VecNumElts * VecOpScalarSize / DestScalarSize;
+      unsigned VecOpIdx = Cst->getZExtValue();
+      unsigned NewIdx =
+          DL.isBigEndian()
+              ? (VecOpIdx + 1) * VecOpScalarSize / DestScalarSize - 1
+              : VecOpIdx * VecOpScalarSize / DestScalarSize;
+
+      Type *BitCastTo = VectorType::get(DestTy, BitCastNumElts);
+      Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
+      return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
+    }
+  }
+
   return nullptr;
 }
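Before the tests, a quick standalone sanity check of the endian-dependent index math in the hunk above (editorial illustration only; remapIndex is a hypothetical helper, not part of the patch, and it assumes the even-split guard already passed):

  // Standalone sketch of the NewIdx computation; not part of the patch.
  #include <cassert>
  #include <cstdio>

  // Mirrors the patch's index math (hypothetical helper for illustration).
  static unsigned remapIndex(unsigned VecOpIdx, unsigned VecOpScalarSize,
                             unsigned DestScalarSize, bool IsBigEndian) {
    assert(VecOpScalarSize % DestScalarSize == 0 && "guard must hold");
    return IsBigEndian ? (VecOpIdx + 1) * VecOpScalarSize / DestScalarSize - 1
                       : VecOpIdx * VecOpScalarSize / DestScalarSize;
  }

  int main() {
    // <3 x i64> truncated to i32: each i64 covers two i32 slots.
    for (unsigned Idx = 0; Idx < 3; ++Idx)
      printf("elt %u -> LE i32 idx %u, BE i32 idx %u\n", Idx,
             remapIndex(Idx, 64, 32, false), remapIndex(Idx, 64, 32, true));
    // Prints 0 -> 0/1, 1 -> 2/3, 2 -> 4/5, matching the CHECK lines in the
    // pr45314_le.ll and pr45314_be.ll tests below.
    return 0;
  }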
Index: llvm/test/Transforms/InstCombine/pr45314_be.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/pr45314_be.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "E"
+
+define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 3
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 1
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 5
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 2
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 3
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 0
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 7
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 1
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 11
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 2
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Do not optimize if it would result in an invalid bitcast instruction
+; (i67 does not split evenly into i13 elements).
+define i13 @shrinkExtractElt_i67_to_i13_2(<3 x i67> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i67_to_i13_2(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i67> [[X:%.*]], i459 2
+; CHECK-NEXT:    [[T:%.*]] = trunc i67 [[E]] to i13
+; CHECK-NEXT:    ret i13 [[T]]
+;
+  %e = extractelement <3 x i67> %x, i459 2
+  %t = trunc i67 %e to i13
+  ret i13 %t
+}
+
+; Do not canonicalize if that would increase the instruction count.
+declare void @use(i64)
+define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2_extra_use(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2
+; CHECK-NEXT:    call void @use(i64 [[E]])
+; CHECK-NEXT:    [[T:%.*]] = trunc i64 [[E]] to i16
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i64 2
+  call void @use(i64 %e)
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
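An editorial aside on why this big-endian file and the little-endian file below expect different indices (illustration, not part of the patch): viewing <3 x i64> as <6 x i32>, each i64 element spans two i32 slots, and on a big-endian target its low 32 bits, the bits trunc keeps, sit in the higher-numbered slot:

  i64 elt:   |    0    |    1    |    2    |
  i32 slot:  |  0 |  1 |  2 |  3 |  4 |  5 |
               hi   lo   hi   lo   hi   lo

Hence indices 1, 3, 5 in the CHECK lines above, versus 0, 2, 4 in pr45314_le.ll below.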
Index: llvm/test/Transforms/InstCombine/pr45314_le.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/pr45314_le.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+target datalayout = "e"
+
+define i32 @shrinkExtractElt_i64_to_i32_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 0
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 2
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 1
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i32 @shrinkExtractElt_i64_to_i32_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i32_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <6 x i32>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <6 x i32> [[TMP1]], i32 4
+; CHECK-NEXT:    ret i32 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i32 2
+  %t = trunc i64 %e to i32
+  ret i32 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_0(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_0(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 0
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 0
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_1(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_1(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 4
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 1
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+define i16 @shrinkExtractElt_i64_to_i16_2(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <3 x i64> [[X:%.*]] to <12 x i16>
+; CHECK-NEXT:    [[T:%.*]] = extractelement <12 x i16> [[TMP1]], i32 8
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i16 2
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
+
+; Do not optimize if it would result in an invalid bitcast instruction
+; (i67 does not split evenly into i13 elements).
+define i13 @shrinkExtractElt_i67_to_i13_2(<3 x i67> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i67_to_i13_2(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i67> [[X:%.*]], i459 2
+; CHECK-NEXT:    [[T:%.*]] = trunc i67 [[E]] to i13
+; CHECK-NEXT:    ret i13 [[T]]
+;
+  %e = extractelement <3 x i67> %x, i459 2
+  %t = trunc i67 %e to i13
+  ret i13 %t
+}
+
+; Do not canonicalize if that would increase the instruction count.
+declare void @use(i64)
+define i16 @shrinkExtractElt_i64_to_i16_2_extra_use(<3 x i64> %x) {
+; CHECK-LABEL: @shrinkExtractElt_i64_to_i16_2_extra_use(
+; CHECK-NEXT:    [[E:%.*]] = extractelement <3 x i64> [[X:%.*]], i64 2
+; CHECK-NEXT:    call void @use(i64 [[E]])
+; CHECK-NEXT:    [[T:%.*]] = trunc i64 [[E]] to i16
+; CHECK-NEXT:    ret i16 [[T]]
+;
+  %e = extractelement <3 x i64> %x, i64 2
+  call void @use(i64 %e)
+  %t = trunc i64 %e to i16
+  ret i16 %t
+}
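Both test files carry the update_test_checks.py NOTE header, so after changing the transform the CHECK lines should be regenerated rather than edited by hand. A typical invocation (the build/bin path is an assumption about the local build tree, adjust as needed):

  python llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Transforms/InstCombine/pr45314_be.ll \
      llvm/test/Transforms/InstCombine/pr45314_le.ll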