diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1454,8 +1454,83 @@
   return createStoreExpression(SI, StoreAccess);
 }
 
+// A load can have one or more dependencies, as the following examples show:
+//
+// Example 1:
+//  BB1:
+//   ...
+//   store i32 %V1, i32* %P
+//   ...
+//   %V2 = load i32, i32* %P
+//   ...
+//
+// Example 2:
+//  BB1:                          BB2:
+//   store i32 %V1, i32* %P        %V2 = load i32, i32* %P
+//   br label %BB3                 br label %BB3
+//                   \            /
+//                    BB3:
+//                     %V3 = load i32, i32* %P
+//
+// In the first example, the load (%V2) has only one dependency. In the second
+// example, the load (%V3) has two dependencies. Therefore, we add the load
+// along with both of its dependencies to the LoadCoercion map. However, this
+// is not always the case, as shown below:
+//
+// Example 3:
+//  BB1:
+//   %P1 = bitcast i32* %P to <4 x i32>*
+//   %V1 = load <4 x i32>, <4 x i32>* %P1
+//   br i1 %cond, label %BB2, label %BB3
+//                  /             \
+//  BB2:                                      BB3:
+//   %P2 = bitcast i32* %P to <2 x i32>*       %V3 = load i32, i32* %P
+//   %V2 = load <2 x i32>, <2 x i32>* %P2      br label %BB4
+//   br label %BB4                            /
+//                   \                       /
+//                    BB4:
+//                     %V4 = load i32, i32* %P
+//
+// In the above example, the load (%V4) can be optimized out by any of the
+// loads (%V1, %V2, %V3). But loads %V2 and %V3 can themselves be optimized
+// out by %V1. For this reason, we do an extra check before adding the load to
+// the map: if the load is already in the map and the existing depending
+// instruction dominates the current depending instruction, then we do not add
+// the new depending instruction to the LoadCoercion map. If, instead, the
+// current depending instruction dominates the existing one, then we remove
+// the existing depending instruction from the LoadCoercion map and add the
+// current one. Therefore, in Example 3, the load (%V4) has only one
+// dependency (%V1), and only that one is added to the LoadCoercion map.
 void NewGVN::tryAddLoadDepInsnIntoLoadCoercionMap(
     LoadInst *LI, Instruction *CurrentDepI) const {
+  // Check if LI already exists in the LoadCoercion map.
+  auto It = const_cast<NewGVN *>(this)->LoadCoercion.find(LI);
+  if (It != LoadCoercion.end()) {
+    auto &ExistingDepIs = It->second;
+    // Iterate over all the existing depending instructions of LI.
+    for (Instruction *ExistingDepI :
+         llvm::make_early_inc_range(ExistingDepIs)) {
+
+      if (MSSAWalker->getClobberingMemoryAccess(getMemoryAccess(CurrentDepI)) ==
+              MSSAWalker->getClobberingMemoryAccess(
+                  getMemoryAccess(ExistingDepI)) &&
+          isa<LoadInst>(ExistingDepI) && isa<LoadInst>(CurrentDepI)) {
+        // If the existing depending instruction dominates the current
+        // depending instruction, then we should not add the current depending
+        // instruction to the LoadCoercion map (Example 3).
+        if (DT->dominates(ExistingDepI, CurrentDepI))
+          return;
+
+        // If the current depending instruction dominates the existing one,
+        // then we remove the existing depending instruction from the
+        // LoadCoercion map and add the current one below.
+        if (DT->dominates(CurrentDepI, ExistingDepI))
+          ExistingDepIs.erase(ExistingDepI);
+      }
+    }
+  }
   // Add the load and the corresponding depending instruction in LoadCoercion
   // map.
   const_cast<NewGVN *>(this)->LoadCoercion[LI].insert(CurrentDepI);
@@ -1498,13 +1573,19 @@
     int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
     if (Offset >= 0) {
       // We can coerce a constant load into a load.
-      if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+      if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI))) {
         if (auto *PossibleConstant =
                 getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
           LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
                             << " to constant " << *PossibleConstant << "\n");
           return createConstantExpression(PossibleConstant);
         }
+      } else if (EnableLoadCoercion) {
+        // Similarly, we do not create a load expression for loads that are
+        // eliminated with load coercion.
+        tryAddLoadDepInsnIntoLoadCoercionMap(LI, DepInst);
+        return nullptr;
+      }
     }
   } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
     int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
@@ -1574,8 +1655,45 @@
                                             DefiningInst, DefiningAccess))
         return CoercionResult;
     }
+  } else if (EnableLoadCoercion) {
+    // Check if any of the live-in loads can be eliminated with load coercion.
+    for (const auto &U : DefiningAccess->uses())
+      if (auto *MemUse = dyn_cast<MemoryUse>(U.getUser())) {
+        LoadInst *DependingLoad = dyn_cast<LoadInst>(MemUse->getMemoryInst());
+
+        if (!DependingLoad || LI == DependingLoad)
+          continue;
+
+        // The DependingLoad must be at least as wide (in bits) as the load
+        // that we are trying to optimize.
+        if (DL.getTypeSizeInBits(DependingLoad->getType()).getFixedSize() <
+            DL.getTypeSizeInBits(LI->getType()).getFixedSize())
+          continue;
+
+        // If the depending load does not have any uses, then we should not do
+        // load coercion because the depending load will itself be eliminated.
+        if (DependingLoad->getNumUses() == 0)
+          continue;
+
+        // If the two loads have the same pointer operand, then they are fully
+        // redundant and this is not load coercion.
+        bool DependingLoadDominatesLI = DT->dominates(DependingLoad, LI);
+        if (DependingLoad->getPointerOperand() == LI->getPointerOperand() &&
+            DependingLoadDominatesLI)
+          continue;
+
+        // The two loads should be executed in the right order.
+        if (DependingLoadDominatesLI)
+          performSymbolicLoadCoercion(LI->getType(), LI->getPointerOperand(),
+                                      LI, DependingLoad,
+                                      getMemoryAccess(DependingLoad));
+      }
   }
 
+  // We do not create a load expression for loads that are eliminated with load coercion.
+  if (EnableLoadCoercion && LoadCoercion.count(LI))
+    return nullptr;
+
   const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
                                         DefiningAccess);
   // If our MemoryLeader is not our defining access, add a use to the
diff --git a/llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll b/llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll
--- a/llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll
+++ b/llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll
@@ -12,10 +12,9 @@
 ;
 ; NEWGVN-LABEL: @test1(
 ; NEWGVN-NEXT:    [[V1:%.*]] = load i32, i32* [[P1:%.*]], align 4
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    [[V3:%.*]] = trunc i32 [[V1]] to i8
-; NEWGVN-NEXT:    [[V4:%.*]] = add i8 [[V2]], [[V3]]
+; NEWGVN-NEXT:    [[V4:%.*]] = add i8 [[TMP1]], [[TMP1]]
 ; NEWGVN-NEXT:    ret i8 [[V4]]
 ;
   %V1 = load i32, i32* %P1
@@ -41,11 +40,10 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <2 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <2 x i32>, <2 x i32>* [[P1]], align 8
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i32, i32* [[P2]], align 4
-; NEWGVN-NEXT:    [[V3:%.*]] = bitcast <2 x i32> [[V1]] to i64
-; NEWGVN-NEXT:    [[V4:%.*]] = trunc i64 [[V3]] to i32
-; NEWGVN-NEXT:    [[V5:%.*]] = add i32 [[V2]], [[V4]]
+; NEWGVN-NEXT:    [[V5:%.*]] = add i32 [[TMP1]], [[TMP1]]
 ; NEWGVN-NEXT:    ret i32 [[V5]]
 ;
 Entry:
@@ -75,10 +73,11 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <2 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <2 x i32>, <2 x i32>* [[P1]], align 8
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i32, i32* [[P2]], align 4
 ; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ <2 x i32>, i32 }> undef, <2 x i32> [[V1]], 0
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <2 x i32>, i32 }> [[I1]], i32 [[V2]], 1
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <2 x i32>, i32 }> [[I1]], i32 [[TMP1]], 1
 ; NEWGVN-NEXT:    ret <{ <2 x i32>, i32 }> [[I2]]
 ;
 Entry:
@@ -181,14 +180,16 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <4 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <4 x i32>, <4 x i32>* [[P1]], align 16
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to i128
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i128 [[TMP0]] to i64
+; NEWGVN-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; NEWGVN-NEXT:    [[TMP3:%.*]] = trunc i128 [[TMP0]] to i32
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to <2 x i32>*
-; NEWGVN-NEXT:    [[V2:%.*]] = load <2 x i32>, <2 x i32>* [[P2]], align 8
 ; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V3:%.*]] = load i32, i32* [[P3]], align 4
 ; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> undef, <4 x i32> [[V1]], 0
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I1]], <2 x i32> [[V2]], 1
-; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I2]], i32 [[V3]], 2
-; NEWGVN-NEXT:    [[I4:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I3]], <2 x i32> [[V2]], 3
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I1]], <2 x i32> [[TMP2]], 1
+; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I2]], i32 [[TMP3]], 2
+; NEWGVN-NEXT:    [[I4:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I3]], <2 x i32> [[TMP2]], 3
 ; NEWGVN-NEXT:    ret <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I4]]
 ;
 Entry:
@@ -215,24 +216,23 @@
 ; OLDGVN-NEXT:  Entry:
 ; OLDGVN-NEXT:    [[V1:%.*]] = load i32, i32* [[P1:%.*]], align 4
 ; OLDGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to float*
-; OLDGVN-NEXT:    [[TMP0:%.*]] = bitcast i32 [[V1]] to float
+; OLDGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V1]] to float
 ; OLDGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; OLDGVN:       T:
-; OLDGVN-NEXT:    ret float [[TMP0]]
+; OLDGVN-NEXT:    ret float [[TMP1]]
 ; OLDGVN:       F:
-; OLDGVN-NEXT:    ret float [[TMP0]]
+; OLDGVN-NEXT:    ret float [[TMP1]]
 ;
 ; NEWGVN-LABEL: @test7(
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[V1:%.*]] = load i32, i32* [[P1:%.*]], align 4
+; NEWGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V1]] to float
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to float*
-; NEWGVN-NEXT:    [[V2:%.*]] = load float, float* [[P2]], align 4
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
-; NEWGVN-NEXT:    ret float [[V2]]
+; NEWGVN-NEXT:    ret float [[TMP1]]
 ; NEWGVN:       F:
-; NEWGVN-NEXT:    [[V3:%.*]] = bitcast i32 [[V1]] to float
-; NEWGVN-NEXT:    ret float [[V3]]
+; NEWGVN-NEXT:    ret float [[TMP1]]
 ;
 Entry:
   %V1 = load i32, i32* %P1
@@ -316,14 +316,16 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <4 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <4 x i32>, <4 x i32>* [[P1]], align 16
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to i128
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i128 [[TMP0]] to i64
+; NEWGVN-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; NEWGVN-NEXT:    [[TMP3:%.*]] = trunc i128 [[TMP0]] to i32
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to <2 x i32>*
-; NEWGVN-NEXT:    [[V2:%.*]] = load <2 x i32>, <2 x i32>* [[P2]], align 8
 ; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V3:%.*]] = load i32, i32* [[P3]], align 4
 ; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> undef, <4 x i32> [[V1]], 0
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I1]], <2 x i32> [[V2]], 1
-; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I2]], i32 [[V3]], 2
-; NEWGVN-NEXT:    [[I4:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I3]], <2 x i32> [[V2]], 3
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I1]], <2 x i32> [[TMP2]], 1
+; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I2]], i32 [[TMP3]], 2
+; NEWGVN-NEXT:    [[I4:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I3]], <2 x i32> [[TMP2]], 3
 ; NEWGVN-NEXT:    ret <{ <4 x i32>, <2 x i32>, i32, <2 x i32> }> [[I4]]
 ;
 Entry:
@@ -369,17 +371,19 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <4 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <4 x i32>, <4 x i32>* [[P1]], align 16
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[V1]] to i128
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i128 [[TMP0]] to i32
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i128 [[TMP0]] to i64
+; NEWGVN-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32>
 ; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32 }> undef, <4 x i32> [[V1]], 0
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to <2 x i32>*
-; NEWGVN-NEXT:    [[V2:%.*]] = load <2 x i32>, <2 x i32>* [[P2]], align 8
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32 }> [[I1]], <2 x i32> [[V2]], 1
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32 }> [[I1]], <2 x i32> [[TMP3]], 1
 ; NEWGVN-NEXT:    ret <{ <4 x i32>, <2 x i32>, i32 }> [[I2]]
 ; NEWGVN:       F:
 ; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V3:%.*]] = load i32, i32* [[P3]], align 4
-; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32 }> [[I1]], i32 [[V3]], 2
+; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <4 x i32>, <2 x i32>, i32 }> [[I1]], i32 [[TMP1]], 2
 ; NEWGVN-NEXT:    ret <{ <4 x i32>, <2 x i32>, i32 }> [[I3]]
 ;
 Entry:
@@ -419,13 +423,14 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i8* [[P:%.*]] to <2 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <2 x i32>, <2 x i32>* [[P1]], align 8
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+; NEWGVN-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
 ; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i8* [[P]] to i32*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i32, i32* [[P2]], align 4
 ; NEWGVN-NEXT:    [[P3:%.*]] = bitcast i8* [[P]] to float*
-; NEWGVN-NEXT:    [[V3:%.*]] = load float, float* [[P3]], align 4
 ; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ <2 x i32>, i32, float }> undef, <2 x i32> [[V1]], 0
-; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <2 x i32>, i32, float }> [[I1]], i32 [[V2]], 1
-; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <2 x i32>, i32, float }> [[I2]], float [[V3]], 2
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ <2 x i32>, i32, float }> [[I1]], i32 [[TMP1]], 1
+; NEWGVN-NEXT:    [[I3:%.*]] = insertvalue <{ <2 x i32>, i32, float }> [[I2]], float [[TMP2]], 2
 ; NEWGVN-NEXT:    ret <{ <2 x i32>, i32, float }> [[I3]]
 ;
 Entry:
@@ -454,13 +459,11 @@
 ;
 ; NEWGVN-LABEL: @test12(
 ; NEWGVN-NEXT:    [[V1:%.*]] = load i32, i32* [[P1:%.*]], align 4
-; NEWGVN-NEXT:    [[P2:%.*]] = bitcast i32* [[P1]] to i8*
-; NEWGVN-NEXT:    [[V2:%.*]] = load i8, i8* [[P2]], align 1
-; NEWGVN-NEXT:    [[V3:%.*]] = trunc i32 [[V1]] to i8
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
 ; NEWGVN-NEXT:    store i32 [[V:%.*]], i32* [[P1]], align 4
-; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V]] to i8
-; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[V2]], [[V3]]
-; NEWGVN-NEXT:    [[V6:%.*]] = add i8 [[TMP1]], [[V5]]
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[V]] to i8
+; NEWGVN-NEXT:    [[V5:%.*]] = add i8 [[TMP1]], [[TMP1]]
+; NEWGVN-NEXT:    [[V6:%.*]] = add i8 [[TMP2]], [[V5]]
 ; NEWGVN-NEXT:    ret i8 [[V6]]
 ;
   %V1 = load i32, i32* %P1
@@ -548,6 +551,8 @@
 ; NEWGVN-NEXT:  Entry:
 ; NEWGVN-NEXT:    [[P1:%.*]] = bitcast i32* [[P:%.*]] to <2 x i32>*
 ; NEWGVN-NEXT:    [[V1:%.*]] = load <2 x i32>, <2 x i32>* [[P1]], align 8
+; NEWGVN-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[V1]] to i64
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
 ; NEWGVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
 ; NEWGVN:       T:
 ; NEWGVN-NEXT:    br label [[EXIT:%.*]]
@@ -556,8 +561,7 @@
 ; NEWGVN:       Exit:
 ; NEWGVN-NEXT:    [[PHI:%.*]] = phi i32 [ 100, [[T]] ], [ 200, [[F]] ]
 ; NEWGVN-NEXT:    [[V2:%.*]] = extractelement <2 x i32> [[V1]], i64 1
-; NEWGVN-NEXT:    [[V3:%.*]] = load i32, i32* [[P]], align 4
-; NEWGVN-NEXT:    [[V4:%.*]] = add i32 [[V3]], [[V2]]
+; NEWGVN-NEXT:    [[V4:%.*]] = add i32 [[TMP1]], [[V2]]
 ; NEWGVN-NEXT:    [[V5:%.*]] = add i32 [[V4]], [[PHI]]
 ; NEWGVN-NEXT:    ret i32 [[V5]]
 ;
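Note (illustration only, not part of the patch): when EnableLoadCoercion is set, the code above reuses the value of a wider dominating load instead of re-loading through a narrower pointer, and rebuilds the narrow value with bitcast/trunc instructions. A minimal LLVM IR sketch of the before/after shape that the @test1 check lines expect; the function name @sketch and the value names are illustrative and not taken from the test file:

  ; Before: two loads from the same address with different types.
  define i8 @sketch(i32* %P) {
    %V1 = load i32, i32* %P
    %P2 = bitcast i32* %P to i8*
    %V2 = load i8, i8* %P2
    %V3 = trunc i32 %V1 to i8
    %V4 = add i8 %V2, %V3
    ret i8 %V4
  }

  ; After load coercion: the narrow load is gone and its value is rebuilt
  ; from the wider load with a trunc, matching the NEWGVN check lines above
  ; (the now-dead bitcast is left for later cleanup).
  define i8 @sketch(i32* %P) {
    %V1 = load i32, i32* %P
    %T1 = trunc i32 %V1 to i8
    %P2 = bitcast i32* %P to i8*
    %V4 = add i8 %T1, %T1
    ret i8 %V4
  }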