diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 
 using namespace llvm;
@@ -103,8 +104,12 @@
   Value *getMemCmpExpansionZeroCase();
   Value *getMemCmpEqZeroOneBlock();
   Value *getMemCmpOneBlock();
-  Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType,
-                                 uint64_t OffsetBytes);
+  struct LoadPair {
+    Value *Lhs = nullptr;
+    Value *Rhs = nullptr;
+  };
+  LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
+                       unsigned OffsetBytes);
 
   static LoadEntryVector
   computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -261,18 +266,52 @@
                                   EndBlock->getParent(), EndBlock);
 }
 
-/// Return a pointer to an element of type `LoadSizeType` at offset
-/// `OffsetBytes`.
-Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
-                                                Type *LoadSizeType,
-                                                uint64_t OffsetBytes) {
+MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
+                                                       bool NeedsBSwap,
+                                                       Type *CmpSizeType,
+                                                       unsigned OffsetBytes) {
+  // Get the memory source at offset `OffsetBytes`.
+  Value *LhsSource = CI->getArgOperand(0);
+  Value *RhsSource = CI->getArgOperand(1);
   if (OffsetBytes > 0) {
     auto *ByteType = Type::getInt8Ty(CI->getContext());
-    Source = Builder.CreateConstGEP1_64(
-        ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()),
+    LhsSource = Builder.CreateConstGEP1_64(
+        ByteType, Builder.CreateBitCast(LhsSource, ByteType->getPointerTo()),
+        OffsetBytes);
+    RhsSource = Builder.CreateConstGEP1_64(
+        ByteType, Builder.CreateBitCast(RhsSource, ByteType->getPointerTo()),
        OffsetBytes);
   }
-  return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo());
+  LhsSource = Builder.CreateBitCast(LhsSource, LoadSizeType->getPointerTo());
+  RhsSource = Builder.CreateBitCast(RhsSource, LoadSizeType->getPointerTo());
+
+  // Create a constant or a load from the source.
+  Value *Lhs = nullptr;
+  if (auto *C = dyn_cast<Constant>(LhsSource))
+    Lhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+  if (!Lhs)
+    Lhs = Builder.CreateLoad(LoadSizeType, LhsSource);
+
+  Value *Rhs = nullptr;
+  if (auto *C = dyn_cast<Constant>(RhsSource))
+    Rhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+  if (!Rhs)
+    Rhs = Builder.CreateLoad(LoadSizeType, RhsSource);
+
+  // Swap bytes if required.
+  if (NeedsBSwap) {
+    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+                                                Intrinsic::bswap, LoadSizeType);
+    Lhs = Builder.CreateCall(Bswap, Lhs);
+    Rhs = Builder.CreateCall(Bswap, Rhs);
+  }
+
+  // Zero extend if required.
+  if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+    Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+    Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
+  }
+  return {Lhs, Rhs};
 }
 
 // This function creates the IR instructions for loading and comparing 1 byte.
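For orientation, a minimal sketch (not part of the patch; value names are illustrative) of the IR shape getLoadPair emits for a hypothetical call getLoadPair(i16, /*NeedsBSwap=*/true, i32, /*OffsetBytes=*/4) on two i8* arguments %x and %y, assuming neither source folds to a constant:

  %lhs.addr = getelementptr i8, i8* %x, i64 4       ; offset each source by 4 bytes
  %rhs.addr = getelementptr i8, i8* %y, i64 4
  %lhs.cast = bitcast i8* %lhs.addr to i16*         ; reinterpret as LoadSizeType*
  %rhs.cast = bitcast i8* %rhs.addr to i16*
  %lhs.val = load i16, i16* %lhs.cast
  %rhs.val = load i16, i16* %rhs.cast
  %lhs.be = call i16 @llvm.bswap.i16(i16 %lhs.val)  ; NeedsBSwap: make unsigned order lexicographic
  %rhs.be = call i16 @llvm.bswap.i16(i16 %rhs.val)
  %lhs = zext i16 %lhs.be to i32                    ; widen to CmpSizeType
  %rhs = zext i16 %rhs.be to i32

If either source is a constant pointer, the corresponding load is instead replaced by the result of ConstantFoldLoadFromConstPtr.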
@@ -282,18 +321,10 @@
 void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
                                                unsigned OffsetBytes) {
   Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
-  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
-  Value *Source1 =
-      getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes);
-  Value *Source2 =
-      getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes);
-
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
-  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
-  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+  const LoadPair Loads =
+      getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
+                  Type::getInt32Ty(CI->getContext()), OffsetBytes);
+  Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
 
   PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
 
@@ -340,41 +371,19 @@
           : IntegerType::get(CI->getContext(), MaxLoadSize * 8);
   for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
     const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
-
-    IntegerType *LoadSizeType =
-        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
-
-    Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
-                                             CurLoadEntry.Offset);
-    Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
-                                             CurLoadEntry.Offset);
-
-    // Get a constant or load a value for each source address.
-    Value *LoadSrc1 = nullptr;
-    if (auto *Source1C = dyn_cast<Constant>(Source1))
-      LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
-    if (!LoadSrc1)
-      LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-
-    Value *LoadSrc2 = nullptr;
-    if (auto *Source2C = dyn_cast<Constant>(Source2))
-      LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
-    if (!LoadSrc2)
-      LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+    const LoadPair Loads = getLoadPair(
+        IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
+        /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
 
     if (NumLoads != 1) {
-      if (LoadSizeType != MaxLoadType) {
-        LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
-        LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
-      }
       // If we have multiple loads per block, we need to generate a composite
       // comparison using xor+or.
-      Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+      Diff = Builder.CreateXor(Loads.Lhs, Loads.Rhs);
       Diff = Builder.CreateZExt(Diff, MaxLoadType);
       XorList.push_back(Diff);
     } else {
       // If there's only one load per block, we just compare the loaded values.
-      Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+      Cmp = Builder.CreateICmpNE(Loads.Lhs, Loads.Rhs);
     }
   }
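To see why the xor+or form works: a word-wise xor is zero iff the two words are equal, so or-ing the xors of all pairs in a block is zero iff every pair matches, and a single icmp then decides the whole block. A sketch for two i64 load pairs (illustrative names, not from the patch):

  %diff0 = xor i64 %lhs0, %rhs0   ; nonzero iff the first pair differs
  %diff1 = xor i64 %lhs1, %rhs1
  %any = or i64 %diff0, %diff1
  %cmp = icmp ne i64 %any, 0      ; true iff any pair differs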
@@ -451,35 +460,18 @@
 
   Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
 
-  Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
-                                           CurLoadEntry.Offset);
-  Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
-                                           CurLoadEntry.Offset);
-
-  // Load LoadSizeType from the base address.
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  if (DL.isLittleEndian()) {
-    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
-    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
-    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
-  }
-
-  if (LoadSizeType != MaxLoadType) {
-    LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
-    LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
-  }
+  const LoadPair Loads =
+      getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
+                  CurLoadEntry.Offset);
 
   // Add the loaded values to the phi nodes for calculating memcmp result only
   // if result is not used in a zero equality.
   if (!IsUsedForZeroCmp) {
-    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
-    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]);
+    ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]);
   }
 
-  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Loads.Lhs, Loads.Rhs);
   BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
                            ? EndBlock
                            : LoadCmpBlocks[BlockIndex + 1];
@@ -568,42 +560,27 @@
 /// the compare, branch, and phi IR that is required in the general case.
 Value *MemCmpExpansion::getMemCmpOneBlock() {
   Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
-  Value *Source1 = CI->getArgOperand(0);
-  Value *Source2 = CI->getArgOperand(1);
-
-  // Cast source to LoadSizeType*.
-  if (Source1->getType() != LoadSizeType)
-    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
-  if (Source2->getType() != LoadSizeType)
-    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
-  // Load LoadSizeType from the base address.
-  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
-  if (DL.isLittleEndian() && Size != 1) {
-    Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
-                                                Intrinsic::bswap, LoadSizeType);
-    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
-    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
-  }
+  bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
 
+  // The i8 and i16 cases don't need compares. We zext the loaded values and
+  // subtract them to get the suitable negative, zero, or positive i32 result.
   if (Size < 4) {
-    // The i8 and i16 cases don't need compares. We zext the loaded values and
-    // subtract them to get the suitable negative, zero, or positive i32 result.
-    LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
-    LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
-    return Builder.CreateSub(LoadSrc1, LoadSrc2);
+    const LoadPair Loads =
+        getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
+                    /*Offset*/ 0);
+    return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
   }
 
+  const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
+                                     /*Offset*/ 0);
   // The result of memcmp is negative, zero, or positive, so produce that by
   // subtracting 2 extended compare bits: sub (ugt, ult).
   // If a target prefers to use selects to get -1/0/1, they should be able
   // to transform this later. The inverse transform (going from selects to math)
   // may not be possible in the DAG because the selects got converted into
   // branches before we got there.
-  Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
-  Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+  Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);
+  Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);
   Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
   Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
   return Builder.CreateSub(ZextUGT, ZextULT);
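The sub (ugt, ult) trick produces the required sign without branches; because the loaded words were bswap'ed on little-endian targets, unsigned integer order matches lexicographic byte order. A sketch for an i64 comparison (illustrative names):

  %ugt = icmp ugt i64 %lhs, %rhs
  %ult = icmp ult i64 %lhs, %rhs
  %ugt.ext = zext i1 %ugt to i32
  %ult.ext = zext i1 %ult to i32
  %res = sub i32 %ugt.ext, %ult.ext   ; 1 if lhs > rhs, -1 if lhs < rhs, 0 if equal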
@@ -869,6 +846,9 @@
       ++BBIt;
     }
   }
+  if (MadeChanges)
+    for (BasicBlock &BB : F)
+      SimplifyInstructionsInBlock(&BB);
   return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
--- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -89,32 +89,8 @@
 ; Validate with > 0
 define signext i32 @zeroEqualityTest04() {
 ; CHECK-LABEL: zeroEqualityTest04:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addis 3, 2, .LzeroEqualityTest02.buffer1@toc@ha
-; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest02.buffer2@toc@ha
-; CHECK-NEXT: addi 6, 3, .LzeroEqualityTest02.buffer1@toc@l
-; CHECK-NEXT: addi 5, 4, .LzeroEqualityTest02.buffer2@toc@l
-; CHECK-NEXT: ldbrx 3, 0, 6
-; CHECK-NEXT: ldbrx 4, 0, 5
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: bne 0, .LBB3_2
-; CHECK-NEXT: # %bb.1: # %loadbb1
-; CHECK-NEXT: li 4, 8
-; CHECK-NEXT: ldbrx 3, 6, 4
-; CHECK-NEXT: ldbrx 4, 5, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: beq 0, .LBB3_3
-; CHECK-NEXT: .LBB3_2: # %res_block
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: li 3, 1
-; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: isel 5, 4, 3, 0
-; CHECK-NEXT: .LBB3_3: # %endblock
-; CHECK-NEXT: extsw 3, 5
-; CHECK-NEXT: neg 3, 3
-; CHECK-NEXT: rldicl 3, 3, 1, 63
-; CHECK-NEXT: xori 3, 3, 1
+; CHECK: # %bb.0: # %loadbb
+; CHECK-NEXT: li 3, 0
 ; CHECK-NEXT: blr
 %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
 %not.cmp = icmp slt i32 %call, 1
@@ -125,30 +101,8 @@
 ; Validate with < 0
 define signext i32 @zeroEqualityTest05() {
 ; CHECK-LABEL: zeroEqualityTest05:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addis 3, 2, .LzeroEqualityTest03.buffer1@toc@ha
-; CHECK-NEXT: addis 4, 2, .LzeroEqualityTest03.buffer2@toc@ha
-; CHECK-NEXT: addi 6, 3, .LzeroEqualityTest03.buffer1@toc@l
-; CHECK-NEXT: addi 5, 4, .LzeroEqualityTest03.buffer2@toc@l
-; CHECK-NEXT: ldbrx 3, 0, 6
-; CHECK-NEXT: ldbrx 4, 0, 5
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: bne 0, .LBB4_2
-; CHECK-NEXT: # %bb.1: # %loadbb1
-; CHECK-NEXT: li 4, 8
-; CHECK-NEXT: ldbrx 3, 6, 4
-; CHECK-NEXT: ldbrx 4, 5, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: beq 0, .LBB4_3
-; CHECK-NEXT: .LBB4_2: # %res_block
-; CHECK-NEXT: cmpld 3, 4
-; CHECK-NEXT: li 3, 1
-; CHECK-NEXT: li 4, -1
-; CHECK-NEXT: isel 5, 4, 3, 0
-; CHECK-NEXT: .LBB4_3: # %endblock
-; CHECK-NEXT: nor 3, 5, 5
-; CHECK-NEXT: rlwinm 3, 3, 1, 31, 31
+; CHECK: # %bb.0: # %loadbb
+; CHECK-NEXT: li 3, 0
 ; CHECK-NEXT: blr
 %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
 %call.lobit = lshr i32 %call, 31
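The two PowerPC functions above compare a pair of constant 16-byte buffers, so getLoadPair folds every load via ConstantFoldLoadFromConstPtr, and the SimplifyInstructionsInBlock sweep added above folds the dependent compares, selects, and single-entry phis. The memcmp result is then known at compile time, and each body collapses to materializing a constant:

  ; conceptually, after folding (a sketch):
  ret i32 0    ; matches the "li 3, 0; blr" in both updated check blocks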
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
@@ -20,8 +20,8 @@
   ; CHECK: [[BCC1:%[0-9]+]] = bitcast i32* {{.*}} to i8*
   ; CHECK-NEXT: [[BCC2:%[0-9]+]] = bitcast i32* {{.*}} to i8*
   ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, i8* [[BCC2]], i64 8
-  ; CHECK-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
   ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, i8* [[BCC1]], i64 8
+  ; CHECK-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
   ; CHECK-NEXT: [[BCL2:%[0-9]+]] = bitcast i8* [[GEP2]] to i64*
   ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[BCL1]]
   ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[BCL2]]
@@ -45,8 +45,8 @@
   ; CHECK-BE: [[BCC1:%[0-9]+]] = bitcast i32* {{.*}} to i8*
   ; CHECK-BE-NEXT: [[BCC2:%[0-9]+]] = bitcast i32* {{.*}} to i8*
   ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, i8* [[BCC2]], i64 8
-  ; CHECK-BE-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
   ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, i8* [[BCC1]], i64 8
+  ; CHECK-BE-NEXT: [[BCL1:%[0-9]+]] = bitcast i8* [[GEP1]] to i64*
   ; CHECK-BE-NEXT: [[BCL2:%[0-9]+]] = bitcast i8* [[GEP2]] to i64*
   ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[BCL1]]
   ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[BCL2]]
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -230,7 +230,7 @@
 define i32 @length3(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length3:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -239,14 +239,14 @@
 ; X86-NEXT: rolw $8, %dx
 ; X86-NEXT: rolw $8, %si
 ; X86-NEXT: cmpw %si, %dx
-; X86-NEXT: jne .LBB9_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB9_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 2(%eax), %eax
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
-; X86-NEXT: .LBB9_1: # %res_block
+; X86-NEXT: .LBB9_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
@@ -254,19 +254,19 @@
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length3:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: jne .LBB9_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB9_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB9_1: # %res_block
+; X64-NEXT: .LBB9_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -445,7 +445,7 @@
 define i32 @length5(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length5:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -454,14 +454,14 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB16_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB16_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
-; X86-NEXT: .LBB16_1: # %res_block
+; X86-NEXT: .LBB16_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
@@ -469,19 +469,19 @@
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB16_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB16_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB16_1: # %res_block
+; X64-NEXT: .LBB16_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -521,7 +521,7 @@
 define i1 @length5_lt(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length5_lt:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -530,38 +530,38 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB18_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB18_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB18_3
-; X86-NEXT: .LBB18_1: # %res_block
+; X86-NEXT: jmp .LBB18_2
+; X86-NEXT: .LBB18_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB18_3: # %endblock
+; X86-NEXT: .LBB18_2: # %endblock
 ; X86-NEXT: shrl $31, %eax
 ; X86-NEXT: # kill: def $al killed $al killed $eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5_lt:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB18_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: shrl $31, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB18_1: # %res_block
+; X64-NEXT: .LBB18_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ b/llvm/test/CodeGen/X86/memcmp-optsize.ll
@@ -111,7 +111,7 @@
 define i32 @length3(i8* %X, i8* %Y) nounwind optsize {
 ; X86-LABEL: length3:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -120,34 +120,34 @@
 ; X86-NEXT: rolw $8, %dx
 ; X86-NEXT: rolw $8, %si
 ; X86-NEXT: cmpw %si, %dx
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 2(%eax), %eax
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB4_3
-; X86-NEXT: .LBB4_1: # %res_block
+; X86-NEXT: jmp .LBB4_2
+; X86-NEXT: .LBB4_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB4_3: # %endblock
+; X86-NEXT: .LBB4_2: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length3:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: jne .LBB4_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB4_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB4_1: # %res_block
+; X64-NEXT: .LBB4_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -256,7 +256,7 @@
 define i32 @length5(i8* %X, i8* %Y) nounwind optsize {
 ; X86-LABEL: length5:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -265,34 +265,34 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB9_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB9_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB9_3
-; X86-NEXT: .LBB9_1: # %res_block
+; X86-NEXT: jmp .LBB9_2
+; X86-NEXT: .LBB9_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB9_3: # %endblock
+; X86-NEXT: .LBB9_2: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB9_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB9_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB9_1: # %res_block
+; X64-NEXT: .LBB9_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ b/llvm/test/CodeGen/X86/memcmp-pgso.ll
@@ -111,7 +111,7 @@
 define i32 @length3(i8* %X, i8* %Y) nounwind !prof !14 {
 ; X86-LABEL: length3:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -120,34 +120,34 @@
 ; X86-NEXT: rolw $8, %dx
 ; X86-NEXT: rolw $8, %si
 ; X86-NEXT: cmpw %si, %dx
-; X86-NEXT: jne .LBB4_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB4_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 2(%eax), %eax
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB4_3
-; X86-NEXT: .LBB4_1: # %res_block
+; X86-NEXT: jmp .LBB4_2
+; X86-NEXT: .LBB4_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB4_3: # %endblock
+; X86-NEXT: .LBB4_2: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length3:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: jne .LBB4_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB4_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB4_1: # %res_block
+; X64-NEXT: .LBB4_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -256,7 +256,7 @@
 define i32 @length5(i8* %X, i8* %Y) nounwind !prof !14 {
 ; X86-LABEL: length5:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -265,34 +265,34 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB9_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB9_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB9_3
-; X86-NEXT: .LBB9_1: # %res_block
+; X86-NEXT: jmp .LBB9_2
+; X86-NEXT: .LBB9_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB9_3: # %endblock
+; X86-NEXT: .LBB9_2: # %endblock
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB9_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB9_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB9_1: # %res_block
+; X64-NEXT: .LBB9_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -98,23 +98,17 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: movzwl .L.str+1, %ecx
 ; X86-NEXT: rolw $8, %ax
-; X86-NEXT: rolw $8, %cx
 ; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movzwl %cx, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: addl $-12594, %eax # imm = 0xCECE
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length2_const:
 ; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzwl .L.str+{{.*}}(%rip), %ecx
 ; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: addl $-12594, %eax # imm = 0xCECE
 ; X64-NEXT: retq
 %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([513 x i8], [513 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
 ret i32 %m
@@ -125,12 +119,9 @@
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: movzwl (%eax), %eax
-; X86-NEXT: movzwl .L.str+1, %ecx
 ; X86-NEXT: rolw $8, %ax
-; X86-NEXT: rolw $8, %cx
 ; X86-NEXT: movzwl %ax, %eax
-; X86-NEXT: movzwl %cx, %ecx
-; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: addl $-12594, %eax # imm = 0xCECE
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: setg %al
 ; X86-NEXT: retl
 ;
@@ -138,12 +129,9 @@
 ; X64-LABEL: length2_gt_const:
 ; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzwl .L.str+{{.*}}(%rip), %ecx
 ; X64-NEXT: rolw $8, %ax
-; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: movzwl %cx, %ecx
-; X64-NEXT: subl %ecx, %eax
+; X64-NEXT: addl $-12594, %eax # imm = 0xCECE
 ; X64-NEXT: testl %eax, %eax
 ; X64-NEXT: setg %al
 ; X64-NEXT: retq
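The new addl immediate is the constant-folded right-hand side: a worked example, assuming @.str begins with the bytes "012" (0x30 0x31 0x32), an assumption consistent with the printed immediate. The i16 load from .L.str+1 then reads bytes 0x31 0x32, i.e. 0x3231 on a little-endian target; bswap yields 0x3132 = 12594; and the subtraction folds into an add of its negation:

  ; eax - 12594  ==  eax + (-12594)
  ; addl $-12594, %eax   # imm = 0xCECE (the low 16 bits of -12594)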
@@ -288,7 +276,7 @@
 define i32 @length3(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length3:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -297,14 +285,14 @@
 ; X86-NEXT: rolw $8, %dx
 ; X86-NEXT: rolw $8, %si
 ; X86-NEXT: cmpw %si, %dx
-; X86-NEXT: jne .LBB11_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB11_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 2(%eax), %eax
 ; X86-NEXT: movzbl 2(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
-; X86-NEXT: .LBB11_1: # %res_block
+; X86-NEXT: .LBB11_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
@@ -312,19 +300,19 @@
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length3:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movzwl (%rdi), %eax
 ; X64-NEXT: movzwl (%rsi), %ecx
 ; X64-NEXT: rolw $8, %ax
 ; X64-NEXT: rolw $8, %cx
 ; X64-NEXT: cmpw %cx, %ax
-; X64-NEXT: jne .LBB11_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB11_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 2(%rdi), %eax
 ; X64-NEXT: movzbl 2(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB11_1: # %res_block
+; X64-NEXT: .LBB11_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -503,7 +491,7 @@
 define i32 @length5(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length5:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -512,14 +500,14 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB18_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB18_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
-; X86-NEXT: .LBB18_1: # %res_block
+; X86-NEXT: .LBB18_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
@@ -527,19 +515,19 @@
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB18_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB18_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB18_1: # %res_block
+; X64-NEXT: .LBB18_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
@@ -579,7 +567,7 @@
 define i1 @length5_lt(i8* %X, i8* %Y) nounwind {
 ; X86-LABEL: length5_lt:
-; X86: # %bb.0: # %loadbb
+; X86: # %bb.0:
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
@@ -588,38 +576,38 @@
 ; X86-NEXT: bswapl %edx
 ; X86-NEXT: bswapl %esi
 ; X86-NEXT: cmpl %esi, %edx
-; X86-NEXT: jne .LBB20_1
-; X86-NEXT: # %bb.2: # %loadbb1
+; X86-NEXT: jne .LBB20_3
+; X86-NEXT: # %bb.1: # %loadbb1
 ; X86-NEXT: movzbl 4(%eax), %eax
 ; X86-NEXT: movzbl 4(%ecx), %ecx
 ; X86-NEXT: subl %ecx, %eax
-; X86-NEXT: jmp .LBB20_3
-; X86-NEXT: .LBB20_1: # %res_block
+; X86-NEXT: jmp .LBB20_2
+; X86-NEXT: .LBB20_3: # %res_block
 ; X86-NEXT: setae %al
 ; X86-NEXT: movzbl %al, %eax
 ; X86-NEXT: leal -1(%eax,%eax), %eax
-; X86-NEXT: .LBB20_3: # %endblock
+; X86-NEXT: .LBB20_2: # %endblock
 ; X86-NEXT: shrl $31, %eax
 ; X86-NEXT: # kill: def $al killed $al killed $eax
 ; X86-NEXT: popl %esi
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: length5_lt:
-; X64: # %bb.0: # %loadbb
+; X64: # %bb.0:
 ; X64-NEXT: movl (%rdi), %eax
 ; X64-NEXT: movl (%rsi), %ecx
 ; X64-NEXT: bswapl %eax
 ; X64-NEXT: bswapl %ecx
 ; X64-NEXT: cmpl %ecx, %eax
-; X64-NEXT: jne .LBB20_1
-; X64-NEXT: # %bb.2: # %loadbb1
+; X64-NEXT: jne .LBB20_3
+; X64-NEXT: # %bb.1: # %loadbb1
 ; X64-NEXT: movzbl 4(%rdi), %eax
 ; X64-NEXT: movzbl 4(%rsi), %ecx
 ; X64-NEXT: subl %ecx, %eax
 ; X64-NEXT: shrl $31, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq
-; X64-NEXT: .LBB20_1: # %res_block
+; X64-NEXT: .LBB20_3: # %res_block
 ; X64-NEXT: setae %al
 ; X64-NEXT: movzbl %al, %eax
 ; X64-NEXT: leal -1(%rax,%rax), %eax
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -26,9 +26,7 @@
 ; ALL-LABEL: @cmp3(
 ; ALL-NEXT: br label [[LOADBB:%.*]]
 ; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i16 [ [[TMP7:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i16 [ [[TMP8:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
 ; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
 ; ALL: loadbb:
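The res_block checks in this file shrink because the new SimplifyInstructionsInBlock sweep folds a phi with a single incoming value into that value, so the icmp in res_block now uses the loaded values directly. Schematically, from the cmp3 hunk above (FileCheck names abbreviated):

  ; before:
  %phi.src1 = phi i16 [ %7, %loadbb ]
  %phi.src2 = phi i16 [ %8, %loadbb ]
  %1 = icmp ult i16 %phi.src1, %phi.src2
  ; after:
  %1 = icmp ult i16 %7, %8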
@@ -80,9 +78,7 @@
 ; ALL-LABEL: @cmp5(
 ; ALL-NEXT: br label [[LOADBB:%.*]]
 ; ALL: res_block:
-; ALL-NEXT: [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ]
-; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; ALL-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
 ; ALL-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; ALL-NEXT: br label [[ENDBLOCK:%.*]]
 ; ALL: loadbb:
@@ -131,10 +127,10 @@
 ; ALL-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; ALL: loadbb1:
 ; ALL-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; ALL-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16*
-; ALL-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16*
-; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]]
+; ALL-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; ALL-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i16*
+; ALL-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i16*
+; ALL-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
 ; ALL-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
 ; ALL-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
 ; ALL-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
@@ -179,10 +175,10 @@
 ; X32-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32: loadbb1:
 ; X32-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; X32-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32*
-; X32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32*
-; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]]
+; X32-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; X32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+; X32-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32*
+; X32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
 ; X32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
 ; X32-NEXT: [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
 ; X32-NEXT: [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
@@ -218,9 +214,7 @@
 ; X64-LABEL: @cmp9(
 ; X64-NEXT: br label [[LOADBB:%.*]]
 ; X64: res_block:
-; X64-NEXT: [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ]
-; X64-NEXT: [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ]
-; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP7:%.*]], [[TMP8:%.*]]
 ; X64-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT: br label [[ENDBLOCK:%.*]]
 ; X64: loadbb:
@@ -273,10 +267,10 @@
 ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64: loadbb1:
 ; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i16*
-; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i16*
-; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP11]]
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i16*
+; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i16*
+; X64-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP12]]
 ; X64-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP13]]
 ; X64-NEXT: [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
 ; X64-NEXT: [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
@@ -325,10 +319,10 @@
 ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64: loadbb1:
 ; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i32*
-; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i32*
-; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP11]]
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
+; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i32*
+; X64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]]
 ; X64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP13]]
 ; X64-NEXT: [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
 ; X64-NEXT: [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
@@ -395,10 +389,10 @@
 ; X64-NEXT: br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64: loadbb1:
 ; X64-NEXT: [[TMP10:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to i64*
-; X64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to i64*
-; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP11]]
+; X64-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i64*
+; X64-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP11]] to i64*
+; X64-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP12]]
 ; X64-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP13]]
 ; X64-NEXT: [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
 ; X64-NEXT: [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
@@ -598,10 +592,10 @@
 ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
 ; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
-; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
-; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
+; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16*
+; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16*
+; X32-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
 ; X32-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
 ; X32-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
 ; X32-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
@@ -626,10 +620,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -646,10 +640,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
@@ -675,10 +669,10 @@
 ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
 ; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3
-; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 3
-; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
 ; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
@@ -701,10 +695,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 3
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -721,10 +715,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 3
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
 ; X64_2LD-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
@@ -748,10 +742,10 @@
 ; X32-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP2]]
 ; X32-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
 ; X32-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 4
-; X32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X32-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 4
-; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X32-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 4
+; X32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X32-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X32-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X32-NEXT: [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
 ; X32-NEXT: [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
@@ -855,10 +849,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -875,10 +869,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i16*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i16*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i16*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i16*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i16, i16* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
@@ -916,10 +910,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 3
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -936,10 +930,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 3
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 3
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 3
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
 ; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
@@ -975,10 +969,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -995,10 +989,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 8
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 8
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 8
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i32*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i32*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
 ; X64_2LD-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
@@ -1036,10 +1030,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 5
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 5
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 5
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -1056,10 +1050,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 5
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 5
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 5
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
 ; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
@@ -1095,10 +1089,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 6
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 6
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 6
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -1115,10 +1109,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 6
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 6
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 6
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
 ; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
@@ -1154,10 +1148,10 @@
 ; X64_1LD-NEXT: br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD: loadbb1:
 ; X64_1LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 7
-; X64_1LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_1LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 7
-; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_1LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 7
+; X64_1LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_1LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_1LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_1LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_1LD-NEXT: [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
 ; X64_1LD-NEXT: br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
@@ -1174,10 +1168,10 @@
 ; X64_2LD-NEXT: [[TMP4:%.*]] = load i64, i64* [[TMP2]]
 ; X64_2LD-NEXT: [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
 ; X64_2LD-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[X]], i64 7
-; X64_2LD-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i64*
-; X64_2LD-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* [[Y]], i64 7
-; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i64*
-; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP7]]
+; X64_2LD-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[Y]], i64 7
+; X64_2LD-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP6]] to i64*
+; X64_2LD-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP7]] to i64*
+; X64_2LD-NEXT: [[TMP10:%.*]] = load i64, i64* [[TMP8]]
 ; X64_2LD-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP9]]
 ; X64_2LD-NEXT: [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
 ; X64_2LD-NEXT: [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]