diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1515,6 +1515,57 @@
   return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
 }
 
+/// If we are inserting 2 halves of a value into adjacent elements of a vector,
+/// try to convert to a single insert with appropriate bitcasts.
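+/// For example, with little-endian layout (indices are mirrored for
+/// big-endian):
+///   %lo = trunc i32 %x to i16
+///   %sh = lshr i32 %x, 16
+///   %hi = trunc i32 %sh to i16
+///   %v0 = insertelement <4 x i16> %v, i16 %lo, i64 0
+///   %r  = insertelement <4 x i16> %v0, i16 %hi, i64 1
+/// -->
+///   %bc = bitcast <4 x i16> %v to <2 x i32>
+///   %in = insertelement <2 x i32> %bc, i32 %x, i64 0
+///   %r  = bitcast <2 x i32> %in to <4 x i16>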
+static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
+                                        bool IsBigEndian,
+                                        InstCombiner::BuilderTy &Builder) {
+  Value *VecOp    = InsElt.getOperand(0);
+  Value *ScalarOp = InsElt.getOperand(1);
+  Value *IndexOp  = InsElt.getOperand(2);
+
+  // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
+  // TODO: The insertion order could be reversed.
+  // TODO: Detect smaller fractions of the scalar.
+  // TODO: One-use checks are conservative.
+  auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
+  Value *X, *BaseVec;
+  uint64_t ShAmt, Index0, Index1;
+  if (!VTy || (VTy->getNumElements() & 1) ||
+      !match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), m_Trunc(m_Value(X)),
+                                         m_ConstantInt(Index0)))) ||
+      !match(ScalarOp, m_OneUse(m_Trunc(m_LShr(m_Specific(X),
+                                               m_ConstantInt(ShAmt))))) ||
+      !match(IndexOp, m_ConstantInt(Index1)))
+    return nullptr;
+
+  Type *SrcTy = X->getType();
+  unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
+  unsigned VecEltWidth = VTy->getScalarSizeInBits();
+  if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)
+    return nullptr;
+
+  // The low half must be inserted at element +1 for big-endian.
+  // The high half must be inserted at element +1 for little-endian.
+  if (IsBigEndian ? Index0 != Index1 + 1 : Index0 + 1 != Index1)
+    return nullptr;
+
+  // The high half must be inserted at an even element for big-endian.
+  // The low half must be inserted at an even element for little-endian.
+  if (IsBigEndian ? Index1 & 1 : Index0 & 1)
+    return nullptr;
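+
+  // Together, these checks accept index pairs such as (0, 1) or (2, 3) for
+  // little-endian and (1, 0) or (3, 2) for big-endian, and reject pairs like
+  // (1, 2) or (2, 1) that straddle a wider-element boundary.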
+
+  // Bitcast the base vector to a vector type with the source element type.
+  Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);
+  Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);
+
+  // Scale the insert index for a vector with half as many elements.
+  // bitcast (inselt (bitcast BaseVec), X, NewIndex)
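+  // E.g., inserting into elements 2 and 3 of <4 x i16> becomes NewIndex 1.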
+  uint64_t NewIndex = IsBigEndian ? Index1 / 2 : Index0 / 2;
+  Value *NewInsert = Builder.CreateInsertElement(CastBaseVec, X, NewIndex);
+  return new BitCastInst(NewInsert, VTy);
+}
+
 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   Value *VecOp    = IE.getOperand(0);
   Value *ScalarOp = IE.getOperand(1);
@@ -1642,6 +1693,9 @@
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
+  if (Instruction *Ext = foldTruncInsEltPair(IE, DL.isBigEndian(), Builder))
+    return Ext;
+
   return nullptr;
 }
 
diff --git a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
--- a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
@@ -1,16 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL,LE
 
 
+declare void @use(i16)
+declare void @use_vec(<8 x i16>)
+
 define <4 x i16> @insert_01_poison_v4i16(i32 %x) {
-; ALL-LABEL: @insert_01_poison_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_01_poison_v4i16(
+; BE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; BE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_01_poison_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16>
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -21,13 +29,18 @@
 }
 
 define <8 x i16> @insert_10_poison_v8i16(i32 %x) {
-; ALL-LABEL: @insert_10_poison_v8i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
-; ALL-NEXT:    ret <8 x i16> [[INS1]]
+; BE-LABEL: @insert_10_poison_v8i16(
+; BE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_10_poison_v8i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
+; LE-NEXT:    ret <8 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -37,6 +50,8 @@
   ret <8 x i16> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i32> @insert_12_poison_v4i32(i64 %x) {
 ; ALL-LABEL: @insert_12_poison_v4i32(
 ; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
@@ -54,6 +69,8 @@
   ret <4 x i32> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i16> @insert_21_poison_v4i16(i32 %x) {
 ; ALL-LABEL: @insert_21_poison_v4i16(
 ; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
@@ -72,13 +89,18 @@
 }
 
 define <4 x i32> @insert_23_poison_v4i32(i64 %x) {
-; ALL-LABEL: @insert_23_poison_v4i32(
-; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
-; ALL-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
-; ALL-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
-; ALL-NEXT:    ret <4 x i32> [[INS1]]
+; BE-LABEL: @insert_23_poison_v4i32(
+; BE-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
+; BE-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
+; BE-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
+; BE-NEXT:    ret <4 x i32> [[INS1]]
+;
+; LE-LABEL: @insert_23_poison_v4i32(
+; LE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+; LE-NEXT:    ret <4 x i32> [[INS1]]
 ;
   %hi64 = lshr i64 %x, 32
   %hi32 = trunc i64 %hi64 to i32
@@ -89,13 +111,18 @@
 }
 
 define <4 x i16> @insert_32_poison_v4i16(i32 %x) {
-; ALL-LABEL: @insert_32_poison_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_32_poison_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 1
+; BE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16>
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_32_poison_v4i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3
+; LE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -105,14 +132,23 @@
   ret <4 x i16> %ins1
 }
 
+; Similar to the above tests but with a non-poison base vector.
+
+; Vector is same size as scalar, so this is just a cast.
+; TODO: Could be swapped/rotated into place.
+
 define <2 x i16> @insert_01_v2i16(i32 %x, <2 x i16> %v) {
-; ALL-LABEL: @insert_01_v2i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1
-; ALL-NEXT:    ret <2 x i16> [[INS1]]
+; BE-LABEL: @insert_01_v2i16(
+; BE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; BE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1
+; BE-NEXT:    ret <2 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_01_v2i16(
+; LE-NEXT:    [[INS1:%.*]] = bitcast i32 [[X:%.*]] to <2 x i16>
+; LE-NEXT:    ret <2 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -123,13 +159,19 @@
 }
 
 define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) {
-; ALL-LABEL: @insert_10_v8i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
-; ALL-NEXT:    ret <8 x i16> [[INS1]]
+; BE-LABEL: @insert_10_v8i16(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X:%.*]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_10_v8i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
+; LE-NEXT:    ret <8 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -139,6 +181,8 @@
   ret <8 x i16> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i32> @insert_12_v4i32(i64 %x, <4 x i32> %v) {
 ; ALL-LABEL: @insert_12_v4i32(
 ; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
@@ -156,6 +200,8 @@
   ret <4 x i32> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) {
 ; ALL-LABEL: @insert_21_v4i16(
 ; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
@@ -174,13 +220,19 @@
 }
 
 define <4 x i32> @insert_23_v4i32(i64 %x, <4 x i32> %v) {
-; ALL-LABEL: @insert_23_v4i32(
-; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
-; ALL-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
-; ALL-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
-; ALL-NEXT:    ret <4 x i32> [[INS1]]
+; BE-LABEL: @insert_23_v4i32(
+; BE-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
+; BE-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
+; BE-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
+; BE-NEXT:    ret <4 x i32> [[INS1]]
+;
+; LE-LABEL: @insert_23_v4i32(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <2 x i64>
+; LE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[X:%.*]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+; LE-NEXT:    ret <4 x i32> [[INS1]]
 ;
   %hi64 = lshr i64 %x, 32
   %hi32 = trunc i64 %hi64 to i32
@@ -191,13 +243,19 @@
 }
 
 define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) {
-; ALL-LABEL: @insert_32_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_32_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <2 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X:%.*]], i64 1
+; BE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x i16>
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_32_v4i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3
+; LE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -206,3 +264,131 @@
   %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 2
   ret <4 x i16> %ins1
 }
+
+; negative test - need half-width shift
+
+define <4 x i16> @insert_01_v4i16_wrong_shift1(i32 %x) {
+; ALL-LABEL: @insert_01_v4i16_wrong_shift1(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 8
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; ALL-NEXT:    ret <4 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 8
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
+  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1
+  ret <4 x i16> %ins1
+}
+
+; negative test - need common scalar
+
+define <4 x i16> @insert_01_v4i16_wrong_op(i32 %x, i32 %y) {
+; ALL-LABEL: @insert_01_v4i16_wrong_op(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[Y:%.*]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; ALL-NEXT:    ret <4 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %y to i16
+  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
+  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1
+  ret <4 x i16> %ins1
+}
+
+; TODO: extra use doesn't have to prevent the fold.
+
+define <8 x i16> @insert_67_v4i16_uses1(i32 %x, <8 x i16> %v) {
+; ALL-LABEL: @insert_67_v4i16_uses1(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    call void @use(i16 [[HI16]])
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7
+; ALL-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  call void @use(i16 %hi16)
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7
+  ret <8 x i16> %ins1
+}
+
+; extra use is ok
+
+define <8 x i16> @insert_76_v4i16_uses2(i32 %x, <8 x i16> %v) {
+; BE-LABEL: @insert_76_v4i16_uses2(
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X:%.*]] to i16
+; BE-NEXT:    call void @use(i16 [[LO16]])
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X]], i64 3
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_76_v4i16_uses2(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    call void @use(i16 [[LO16]])
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 7
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 6
+; LE-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  call void @use(i16 %lo16)
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 7
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 6
+  ret <8 x i16> %ins1
+}
+
+; TODO: extra use doesn't have to prevent the fold.
+
+define <8 x i16> @insert_67_v4i16_uses3(i32 %x, <8 x i16> %v) {
+; ALL-LABEL: @insert_67_v4i16_uses3(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6
+; ALL-NEXT:    call void @use_vec(<8 x i16> [[INS0]])
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7
+; ALL-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6
+  call void @use_vec(<8 x i16> %ins0)
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7
+  ret <8 x i16> %ins1
+}
+
+; TODO: This is equivalent to the first test above.
+
+define <4 x i16> @insert_01_poison_v4i16_high_first(i32 %x) {
+; ALL-LABEL: @insert_01_poison_v4i16_high_first(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[INS1]], i16 [[LO16]], i64 0
+; ALL-NEXT:    ret <4 x i16> [[INS0]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins1 = insertelement <4 x i16> poison, i16 %hi16, i64 1
+  %ins0 = insertelement <4 x i16> %ins1, i16 %lo16, i64 0
+  ret <4 x i16> %ins0
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -47,34 +47,24 @@
 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
 ; SSE-LABEL: @ConvertVectors_ByVal(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
-; SSE-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; SSE-NEXT:    [[TMP1:%.*]] = lshr i64 [[V_VAL20]], 32
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V_VAL20]], i64 0
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP1]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
-; SSE-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
+; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
+; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
+; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
+; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
-; AVX-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; AVX-NEXT:    [[TMP1:%.*]] = trunc i64 [[V_VAL20]] to i32
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i64 0
-; AVX-NEXT:    [[TMP3:%.*]] = lshr i64 [[V_VAL20]], 32
-; AVX-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
-; AVX-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
+; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
+; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
+; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry: