Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -1232,15 +1232,21 @@ to ensure a pointer is not null or otherwise the behavior is undefined. ``dereferenceable()`` - This indicates that the parameter or return pointer is dereferenceable. This - attribute may only be applied to pointer typed parameters. A pointer that - is dereferenceable can be loaded from speculatively without a risk of - trapping. The number of bytes known to be dereferenceable must be provided - in parentheses. It is legal for the number of bytes to be less than the - size of the pointee type. The ``nonnull`` attribute does not imply - dereferenceability (consider a pointer to one element past the end of an - array), however ``dereferenceable()`` does imply ``nonnull`` in - ``addrspace(0)`` (which is the default address space), except if the + This indicates that the parameter or return pointer is dereferenceable at + the instant of the call. This attribute may only be applied to pointer + typed parameters. The number of bytes known to be dereferenceable must + be provided in parentheses. It is legal for the number of bytes to be less + than the size of the pointee type. + + A pointer that is dereferenceable at a particular location in the program + can be loaded from speculatively without a risk of trapping at that + location. In general, once a memory location becomes dereferenceable, it + will remain dereferenceable until the underlying object is freed. + + The ``nonnull`` attribute does not imply dereferenceability (consider a + pointer to one element past the end of an array), however + ``dereferenceable()`` does imply ``nonnull`` in ``addrspace(0)`` + (which is the default address space), except if the ``null_pointer_is_valid`` function attribute is present. ``n`` should be a positive number. The pointer should be well defined, otherwise it is undefined behavior. 
This means ``dereferenceable()`` @@ -6003,10 +6009,10 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The existence of the ``!dereferenceable`` metadata on the instruction -tells the optimizer that the value loaded is known to be dereferenceable. -The number of bytes known to be dereferenceable is specified by the integer -value in the metadata node. This is analogous to the ''dereferenceable'' -attribute on parameters and return values. +tells the optimizer that the value loaded is known to be dereferenceable at +that program location. The number of bytes known to be dereferenceable is +specified by the integer value in the metadata node. This is analogous to the +''dereferenceable'' attribute on parameters and return values. .. _md_dereferenceable_or_null: @@ -6015,7 +6021,7 @@ The existence of the ``!dereferenceable_or_null`` metadata on the instruction tells the optimizer that the value loaded is known to be either -dereferenceable or null. +dereferenceable or null at that program location. The number of bytes known to be dereferenceable is specified by the integer value in the metadata node. This is analogous to the ''dereferenceable_or_null'' attribute on parameters and return values. 
Index: llvm/lib/IR/Value.cpp =================================================================== --- llvm/lib/IR/Value.cpp +++ llvm/lib/IR/Value.cpp @@ -39,7 +39,7 @@ using namespace llvm; static cl::opt<bool> UseDerefAtPointSemantics( - "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), + "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(true), cl::desc("Deref attributes and metadata infer facts at definition only")); Index: llvm/test/Analysis/ValueTracking/deref-bitcast-of-gep.ll =================================================================== --- llvm/test/Analysis/ValueTracking/deref-bitcast-of-gep.ll +++ llvm/test/Analysis/ValueTracking/deref-bitcast-of-gep.ll @@ -14,10 +14,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[PTR_GEP:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 32 ; CHECK-NEXT: [[PTR_I32:%.*]] = bitcast i8* [[PTR_GEP]] to i32* -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[PTR_I32]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: call void @use(i32 0) +; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[PTR_I32]], align 4, !invariant.load !0 ; CHECK-NEXT: call void @use(i32 [[VAL]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -43,10 +43,10 @@ ; CHECK-NEXT: [[PTR_IS_NULL:%.*]] = icmp eq i8* [[PTR]], null ; CHECK-NEXT: br i1 [[PTR_IS_NULL]], label [[LEAVE:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[PTR_I32]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: call void @use(i32 0) +; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[PTR_I32]], align 4, !invariant.load !0 ; CHECK-NEXT: call void @use(i32 [[VAL]]) ; CHECK-NEXT: br label [[LOOP]] ; CHECK: leave: @@ -110,10 +110,10 @@ ; CHECK-LABEL: @checkLaunder( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[L:%.*]] = call i8* @llvm.launder.invariant.group.p0i8(i8* [[P:%.*]]) -; CHECK-NEXT: [[VAL:%.*]] = load i8, i8* [[L]], align 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: 
call void @use(i32 0) +; CHECK-NEXT: [[VAL:%.*]] = load i8, i8* [[L]], align 1, !invariant.load !0 ; CHECK-NEXT: call void @use8(i8 [[VAL]]) ; CHECK-NEXT: br label [[LOOP]] ; Index: llvm/test/CodeGen/X86/hoist-invariant-load.ll =================================================================== --- llvm/test/CodeGen/X86/hoist-invariant-load.ll +++ llvm/test/CodeGen/X86/hoist-invariant-load.ll @@ -217,14 +217,13 @@ ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: xorl %r8d, %r8d -; CHECK-NEXT: movq (%rdi), %rdx -; CHECK-NEXT: movq (%rsi), %r9 ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB4_2: ## %for.body ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mulxq %r9, %rsi, %rdi -; CHECK-NEXT: addq %rsi, (%rax) -; CHECK-NEXT: adcq %rdi, 8(%rax) +; CHECK-NEXT: movq (%rdi), %rdx +; CHECK-NEXT: mulxq (%rsi), %rdx, %r9 +; CHECK-NEXT: addq %rdx, (%rax) +; CHECK-NEXT: adcq %r9, 8(%rax) ; CHECK-NEXT: ## %bb.1: ## %for.check ; CHECK-NEXT: ## in Loop: Header=BB4_2 Depth=1 ; CHECK-NEXT: incq %r8 @@ -265,14 +264,12 @@ ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: movq %rdx, %r8 ; CHECK-NEXT: xorl %r9d, %r9d -; CHECK-NEXT: movl (%rdi), %edi -; CHECK-NEXT: movl (%rsi), %esi ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: LBB5_2: ## %for.body ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: movl (%rdi), %eax ; CHECK-NEXT: xorl %edx, %edx -; CHECK-NEXT: divl %esi +; CHECK-NEXT: divl (%rsi) ; CHECK-NEXT: addl %eax, (%r8,%r9,4) ; CHECK-NEXT: ## %bb.1: ## %for.check ; CHECK-NEXT: ## in Loop: Header=BB5_2 Depth=1 Index: llvm/test/CodeGen/X86/licm-dominance.ll =================================================================== --- llvm/test/CodeGen/X86/licm-dominance.ll +++ llvm/test/CodeGen/X86/licm-dominance.ll @@ -11,10 +11,9 @@ ; CHECK-LABEL: CMSColorWorldCreateParametricData: ; CHECK: ## %bb.0: ## %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movb (%rdi), %dil ; CHECK-NEXT: LBB0_1: ## 
%for.body.i ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %dil, %dil +; CHECK-NEXT: cmpb $0, (%rdi) ; CHECK-NEXT: je LBB0_5 ; CHECK-NEXT: ## %bb.2: ## %if.then26.i ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 Index: llvm/test/CodeGen/X86/load-partial.ll =================================================================== --- llvm/test/CodeGen/X86/load-partial.ll +++ llvm/test/CodeGen/X86/load-partial.ll @@ -10,14 +10,30 @@ ; define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE-LABEL: load_float4_float3: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -35,14 +51,14 @@ ; SSE-LABEL: load_float4_float3_0122: ; SSE: # %bb.0: ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: load_float4_float3_0122: 
; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovups (%rdi), %xmm1 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 @@ -59,14 +75,30 @@ } define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE-LABEL: load_float8_float3: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float8_float3: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float8_float3: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float8_float3: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float8_float3: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1 @@ -84,14 +116,14 @@ ; SSE-LABEL: load_float8_float3_0122: ; SSE: # %bb.0: ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: load_float8_float3_0122: ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovups (%rdi), %xmm1 +; AVX-NEXT: vmovsd 
{{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] ; AVX-NEXT: retq %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0 @@ -108,14 +140,32 @@ } define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE-LABEL: load_float4_float3_as_float2_float: -; SSE: # %bb.0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_as_float2_float: +; SSE2: # %bb.0: +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_as_float2_float: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_as_float2_float: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_as_float2_float: ; AVX: # %bb.0: -; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to <2 x float>* %3 = load <2 x float>, <2 x float>* %2, align 4 @@ -157,14 +207,30 @@ } define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) { -; SSE-LABEL: load_float4_float3_trunc: -; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: load_float4_float3_trunc: +; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: 
movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSSE3-LABEL: load_float4_float3_trunc: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: load_float4_float3_trunc: +; SSE41: # %bb.0: +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_trunc: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* %3 = load i64, i64* %2, align 16 @@ -188,14 +254,14 @@ ; SSE-LABEL: load_float4_float3_trunc_0122: ; SSE: # %bb.0: ; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: load_float4_float3_trunc_0122: ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovaps (%rdi), %xmm1 +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] ; AVX-NEXT: retq %2 = bitcast <4 x float>* %0 to i64* @@ -220,26 +286,26 @@ define <4 x float> @load_float4_float3_trunc_0123(<4 x float>* nocapture readonly dereferenceable(16)) { ; SSE2-LABEL: load_float4_float3_trunc_0123: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_float4_float3_trunc_0123: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps (%rdi), %xmm0 +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: 
load_float4_float3_trunc_0123: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps (%rdi), %xmm0 +; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; SSE41-NEXT: retq ; ; AVX-LABEL: load_float4_float3_trunc_0123: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; AVX-NEXT: retq @@ -289,7 +355,10 @@ ; ; AVX-LABEL: load_double4_0u2u: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX-NEXT: retq %2 = load double, double* %0, align 8 %3 = insertelement <4 x double> undef, double %2, i32 0 Index: llvm/test/CodeGen/X86/memcmp-mergeexpand.ll =================================================================== --- llvm/test/CodeGen/X86/memcmp-mergeexpand.ll +++ llvm/test/CodeGen/X86/memcmp-mergeexpand.ll @@ -8,22 +8,37 @@ define zeroext i1 @opeq1( ; X86-LABEL: opeq1: -; X86: # %bb.0: # %"entry+land.rhs.i" +; X86: # %bb.0: # %entry ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx +; X86-NEXT: cmpl (%eax), %edx +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: # %land.rhs.i ; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: xorl (%eax), %edx -; X86-NEXT: xorl 4(%eax), %ecx -; X86-NEXT: orl %edx, %ecx +; X86-NEXT: cmpl 4(%eax), %ecx ; X86-NEXT: sete %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; ; X64-LABEL: opeq1: -; X64: # %bb.0: # %"entry+land.rhs.i" -; X64-NEXT: movq (%rdi), %rax -; 
X64-NEXT: cmpq (%rsi), %rax +; X64: # %bb.0: # %entry +; X64-NEXT: movl (%rdi), %eax +; X64-NEXT: cmpl (%rsi), %eax +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # %bb.2: # %land.rhs.i +; X64-NEXT: movl 4(%rdi), %eax +; X64-NEXT: cmpl 4(%rsi), %eax ; X64-NEXT: sete %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; X64-NEXT: .LBB0_1: +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %"struct.std::pair"* nocapture readonly dereferenceable(8) %a, %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 { Index: llvm/test/Transforms/GVN/PRE/load-pre-licm.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/load-pre-licm.ll +++ llvm/test/Transforms/GVN/PRE/load-pre-licm.ll @@ -266,11 +266,11 @@ define i32 @test6b(i1 %cnd, i32* dereferenceable(8) align 4 %p) { ; CHECK-LABEL: @test6b( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[V1_PRE:%.*]] = load i32, i32* [[P:%.*]], align 4 ; CHECK-NEXT: br label [[HEADER:%.*]] ; CHECK: header: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[CND:%.*]]) [ "deopt"() ] -; CHECK-NEXT: call void @hold(i32 [[V1_PRE]]) +; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: call void @hold(i32 [[V1]]) ; CHECK-NEXT: br label [[HEADER]] ; entry: Index: llvm/test/Transforms/GVN/PRE/pre-load.ll =================================================================== --- llvm/test/Transforms/GVN/PRE/pre-load.ll +++ llvm/test/Transforms/GVN/PRE/pre-load.ll @@ -687,17 +687,14 @@ ; CHECK-LABEL: @test15( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] -; CHECK: entry.if.end_crit_edge: -; CHECK-NEXT: [[VV_PRE:%.*]] = load i32, i32* [[X:%.*]], align 4 -; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X:%.*]], align 4 ; CHECK-NEXT: store i32 [[UU]], i32* [[R:%.*]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] ; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[VV:%.*]] = load i32, i32* [[X]], align 4 ; CHECK-NEXT: ret i32 [[VV]] ; @@ -728,17 +725,14 @@ ; CHECK-LABEL: @test16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[A:%.*]], 0 -; CHECK-NEXT: br i1 [[TOBOOL]], label [[ENTRY_IF_END_CRIT_EDGE:%.*]], label [[IF_THEN:%.*]] -; CHECK: entry.if.end_crit_edge: -; CHECK-NEXT: [[VV_PRE:%.*]] = load i32, i32* [[X:%.*]], align 4 -; CHECK-NEXT: br label [[IF_END:%.*]] +; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X]], align 4 +; CHECK-NEXT: [[UU:%.*]] = load i32, i32* [[X:%.*]], align 4 ; CHECK-NEXT: store i32 [[UU]], i32* [[R:%.*]], align 4 ; CHECK-NEXT: br 
label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[VV:%.*]] = phi i32 [ [[VV_PRE]], [[ENTRY_IF_END_CRIT_EDGE]] ], [ [[UU]], [[IF_THEN]] ] ; CHECK-NEXT: call void @f() +; CHECK-NEXT: [[VV:%.*]] = load i32, i32* [[X]], align 4 ; CHECK-NEXT: ret i32 [[VV]] ; Index: llvm/test/Transforms/GVN/loadpre-context.ll =================================================================== --- llvm/test/Transforms/GVN/loadpre-context.ll +++ llvm/test/Transforms/GVN/loadpre-context.ll @@ -7,15 +7,12 @@ ; CHECK-LABEL: @loadpre_critical_edge( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[ARG:%.*]], null -; CHECK-NEXT: br i1 [[CMP]], label [[NULL_EXIT:%.*]], label [[ENTRY_HEADER_CRIT_EDGE:%.*]] -; CHECK: entry.header_crit_edge: -; CHECK-NEXT: [[V_PRE:%.*]] = load i32, i32* [[ARG]], align 4 -; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[NULL_EXIT:%.*]], label [[HEADER:%.*]] ; CHECK: header: -; CHECK-NEXT: [[V:%.*]] = phi i32 [ [[V_PRE]], [[ENTRY_HEADER_CRIT_EDGE]] ], [ [[SUM:%.*]], [[HEADER]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_HEADER_CRIT_EDGE]] ], [ [[IV_NEXT:%.*]], [[HEADER]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[HEADER]] ] ; CHECK-NEXT: [[NEW_V:%.*]] = call i32 @ro_foo(i32 [[IV]]) #[[ATTR0:[0-9]+]] -; CHECK-NEXT: [[SUM]] = add i32 [[NEW_V]], [[V]] +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[ARG]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[NEW_V]], [[V]] ; CHECK-NEXT: store i32 [[SUM]], i32* [[ARG]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]] @@ -54,13 +51,12 @@ ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[ARG:%.*]], null ; CHECK-NEXT: br i1 [[CMP]], label [[NULL_EXIT:%.*]], label [[PREHEADER:%.*]] ; CHECK: preheader: -; CHECK-NEXT: [[V_PRE:%.*]] = load i32, i32* [[ARG]], align 4 ; CHECK-NEXT: br label [[HEADER:%.*]] ; CHECK: header: -; CHECK-NEXT: [[V:%.*]] = phi i32 [ [[V_PRE]], [[PREHEADER]] ], [ 
[[SUM:%.*]], [[HEADER]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[HEADER]] ] ; CHECK-NEXT: [[NEW_V:%.*]] = call i32 @ro_foo(i32 [[IV]]) #[[ATTR0]] -; CHECK-NEXT: [[SUM]] = add i32 [[NEW_V]], [[V]] +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[ARG]], align 4 +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[NEW_V]], [[V]] ; CHECK-NEXT: store i32 [[SUM]], i32* [[ARG]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N:%.*]] Index: llvm/test/Transforms/InstCombine/call-guard.ll =================================================================== --- llvm/test/Transforms/InstCombine/call-guard.ll +++ llvm/test/Transforms/InstCombine/call-guard.ll @@ -79,10 +79,11 @@ define void @deref_load(i32 %V1, i32* dereferenceable(4) align 4 %P) { ; CHECK-LABEL: @deref_load( +; CHECK-NEXT: [[A:%.*]] = icmp slt i32 [[V1:%.*]], 0 +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[A]], i32 123) [ "deopt"() ] ; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[V2]], [[V1:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 -; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[TMP2]], i32 123) [ "deopt"() ] +; CHECK-NEXT: [[B:%.*]] = icmp slt i32 [[V2]], 0 +; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[B]], i32 456) [ "deopt"() ] ; CHECK-NEXT: ret void ; %A = icmp slt i32 %V1, 0 Index: llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll =================================================================== --- llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll +++ llvm/test/Transforms/InstCombine/masked_intrinsics-inseltpoison.ll @@ -86,9 +86,8 @@ ; CHECK-LABEL: @load_speculative( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]] -; CHECK-NEXT: ret <2 x double> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, i64 0 %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1 @@ -100,9 +99,8 @@ ; CHECK-LABEL: @load_speculative_less_aligned( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> poison, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]] -; CHECK-NEXT: ret <2 x double> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> poison, double %pt, 
i64 0 %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1 Index: llvm/test/Transforms/InstCombine/masked_intrinsics.ll =================================================================== --- llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ -86,9 +86,8 @@ ; CHECK-LABEL: @load_speculative( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]] -; CHECK-NEXT: ret <2 x double> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1 @@ -100,9 +99,8 @@ ; CHECK-LABEL: @load_speculative_less_aligned( ; CHECK-NEXT: [[PTV1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0 ; CHECK-NEXT: [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[UNMASKEDLOAD:%.*]] = load <2 x double>, <2 x double>* [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[MASK:%.*]], <2 x double> [[UNMASKEDLOAD]], <2 x double> [[PTV2]] -; CHECK-NEXT: ret <2 x double> [[TMP1]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* nonnull [[PTR:%.*]], i32 4, <2 x i1> [[MASK:%.*]], <2 x double> [[PTV2]]) +; CHECK-NEXT: ret <2 x double> [[RES]] ; %ptv1 = insertelement <2 x double> undef, double %pt, i64 0 %ptv2 = insertelement <2 x double> %ptv1, double %pt, i64 1 Index: 
llvm/test/Transforms/InstCombine/select.ll =================================================================== --- llvm/test/Transforms/InstCombine/select.ll +++ llvm/test/Transforms/InstCombine/select.ll @@ -1072,9 +1072,8 @@ ; fold the load completely away. define i32 @test78_deref(i1 %flag, i32* dereferenceable(4) align 4 %x, i32* dereferenceable(4) align 4 %y, i32* %z) { ; CHECK-LABEL: @test78_deref( -; CHECK-NEXT: [[X_VAL:%.*]] = load i32, i32* [[X:%.*]], align 4 -; CHECK-NEXT: [[Y_VAL:%.*]] = load i32, i32* [[Y:%.*]], align 4 -; CHECK-NEXT: [[V:%.*]] = select i1 [[FLAG:%.*]], i32 [[X_VAL]], i32 [[Y_VAL]] +; CHECK-NEXT: [[P:%.*]] = select i1 [[FLAG:%.*]], i32* [[X:%.*]], i32* [[Y:%.*]] +; CHECK-NEXT: [[V:%.*]] = load i32, i32* [[P]], align 4 ; CHECK-NEXT: ret i32 [[V]] ; %p = select i1 %flag, i32* %x, i32* %y Index: llvm/test/Transforms/InstCombine/strcmp-memcmp.ll =================================================================== --- llvm/test/Transforms/InstCombine/strcmp-memcmp.ll +++ llvm/test/Transforms/InstCombine/strcmp-memcmp.ll @@ -11,8 +11,8 @@ define i32 @strcmp_memcmp([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -28,8 +28,8 @@ define i32 @strcmp_memcmp2([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp2( ; CHECK-NEXT: 
[[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -43,8 +43,8 @@ define i32 @strcmp_memcmp3([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp3( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -58,8 +58,8 @@ define i32 @strcmp_memcmp4([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp4( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = 
call i32 @strcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -73,8 +73,8 @@ define i32 @strcmp_memcmp5([5 x i8]* dereferenceable (5) %buf) { ; CHECK-LABEL: @strcmp_memcmp5( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [5 x i8], [5 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull align 1 [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -88,8 +88,8 @@ define i32 @strcmp_memcmp6([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp6( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -103,9 +103,9 @@ define i32 @strcmp_memcmp7([12 x i8]* dereferenceable (12) %buf) { ; 
CHECK-LABEL: @strcmp_memcmp7( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[MEMCMP_LOBIT:%.*]] = lshr i32 [[MEMCMP]], 31 -; CHECK-NEXT: ret i32 [[MEMCMP_LOBIT]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]]) +; CHECK-NEXT: [[CALL_LOBIT:%.*]] = lshr i32 [[CALL]], 31 +; CHECK-NEXT: ret i32 [[CALL_LOBIT]] ; %string = getelementptr inbounds [12 x i8], [12 x i8]* %buf, i64 0, i64 0 %call = call i32 @strcmp(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* nonnull %string) @@ -117,8 +117,8 @@ define i32 @strcmp_memcmp8([4 x i8]* dereferenceable (4) %buf) { ; CHECK-LABEL: @strcmp_memcmp8( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -132,8 +132,8 @@ define i32 @strcmp_memcmp9([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strcmp_memcmp9( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* 
noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strcmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0)) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -148,8 +148,8 @@ define i32 @strncmp_memcmp([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(2) [[STRING]], i8* noundef nonnull dereferenceable(2) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 2) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 2) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -165,8 +165,8 @@ define i32 @strncmp_memcmp2([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp2( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x 
i8]* @key, i64 0, i64 0), i64 11) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -180,8 +180,8 @@ define i32 @strncmp_memcmp3([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp3( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 11) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -195,8 +195,8 @@ define i32 @strncmp_memcmp4([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp4( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 5) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -210,8 +210,8 @@ define i32 @strncmp_memcmp5([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp5( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 
x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 5) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -226,8 +226,8 @@ define i32 @strncmp_memcmp6([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp6( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 5) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -241,8 +241,8 @@ define i32 @strncmp_memcmp7([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp7( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef 
nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 4) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -256,8 +256,8 @@ define i32 @strncmp_memcmp8([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp8( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(3) [[STRING]], i8* noundef nonnull dereferenceable(3) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 3) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i64 3) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -271,8 +271,8 @@ define i32 @strncmp_memcmp9([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp9( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 5) +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -286,9 +286,9 @@ define i32 @strncmp_memcmp10([12 x i8]* dereferenceable (12) %buf) { ; 
CHECK-LABEL: @strncmp_memcmp10( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[MEMCMP_LOBIT:%.*]] = lshr i32 [[MEMCMP]], 31 -; CHECK-NEXT: ret i32 [[MEMCMP_LOBIT]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 5) +; CHECK-NEXT: [[CALL_LOBIT:%.*]] = lshr i32 [[CALL]], 31 +; CHECK-NEXT: ret i32 [[CALL_LOBIT]] ; %string = getelementptr inbounds [12 x i8], [12 x i8]* %buf, i64 0, i64 0 %call = call i32 @strncmp(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* nonnull %string, i64 5) @@ -300,8 +300,8 @@ define i32 @strncmp_memcmp11([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp11( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 12) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -315,8 +315,8 @@ define i32 @strncmp_memcmp12([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp12( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: 
[[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull dereferenceable(4) [[STRING]], i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([4 x i8], [4 x i8]* @key, i64 0, i64 0), i8* noundef nonnull [[STRING]], i64 12) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -330,8 +330,8 @@ define i32 @strncmp_memcmp13([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp13( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(2) [[STRING]], i8* noundef nonnull dereferenceable(2) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0), i64 2) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0), i64 2) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; @@ -345,8 +345,8 @@ define i32 @strncmp_memcmp14([12 x i8]* dereferenceable (12) %buf) { ; CHECK-LABEL: @strncmp_memcmp14( ; CHECK-NEXT: [[STRING:%.*]] = getelementptr inbounds [12 x i8], [12 x i8]* [[BUF:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* noundef nonnull dereferenceable(4) [[STRING]], i8* noundef nonnull dereferenceable(4) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0), i64 4) -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: [[CALL:%.*]] = call i32 @strncmp(i8* noundef nonnull [[STRING]], i8* noundef nonnull 
dereferenceable(4) getelementptr inbounds ([8 x i8], [8 x i8]* @abc, i64 0, i64 0), i64 12) +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: ret i32 [[CONV]] ; Index: llvm/test/Transforms/JumpThreading/guards.ll =================================================================== --- llvm/test/Transforms/JumpThreading/guards.ll +++ llvm/test/Transforms/JumpThreading/guards.ll @@ -361,11 +361,10 @@ define void @safe_pre_across_guard(i8* noalias nocapture readonly dereferenceable(8) %p, i1 %load.is.valid) { ; CHECK-LABEL: @safe_pre_across_guard( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOADED_PR:%.*]] = load i8, i8* [[P:%.*]], align 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[LOADED]], [[LOOP]] ], [ [[LOADED_PR]], [[ENTRY:%.*]] ] ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[LOAD_IS_VALID:%.*]]) [ "deopt"() ] +; CHECK-NEXT: [[LOADED:%.*]] = load i8, i8* [[P:%.*]], align 1 ; CHECK-NEXT: [[CONTINUE:%.*]] = icmp eq i8 [[LOADED]], 0 ; CHECK-NEXT: br i1 [[CONTINUE]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: @@ -416,11 +415,10 @@ define void @safe_pre_across_call(i8* noalias nocapture readonly dereferenceable(8) %p) { ; CHECK-LABEL: @safe_pre_across_call( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LOADED_PR:%.*]] = load i8, i8* [[P:%.*]], align 1 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[LOADED:%.*]] = phi i8 [ [[LOADED]], [[LOOP]] ], [ [[LOADED_PR]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @f1() +; CHECK-NEXT: [[LOADED:%.*]] = load i8, i8* [[P:%.*]], align 1 ; CHECK-NEXT: [[CONTINUE:%.*]] = icmp eq i8 [[LOADED]], 0 ; CHECK-NEXT: br i1 [[CONTINUE]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: Index: llvm/test/Transforms/LICM/hoist-deref-load.ll =================================================================== --- llvm/test/Transforms/LICM/hoist-deref-load.ll +++ 
llvm/test/Transforms/LICM/hoist-deref-load.ll @@ -22,18 +22,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C:%.*]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -151,18 +151,18 @@ ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C2]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; 
CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -287,18 +287,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br 
label [[FOR_INC]] ; CHECK: for.inc: @@ -426,18 +426,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -501,18 +501,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -634,7 +634,6 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[C:%.*]] = load i32*, i32** [[CPTR:%.*]], align 8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] @@ -643,6 +642,7 @@ ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[C:%.*]] = load i32*, i32** [[CPTR:%.*]], align 8, !dereferenceable !0 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 @@ -763,18 +763,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, 
[[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -898,18 +898,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; 
CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: @@ -968,18 +968,18 @@ ; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP2]], [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: Index: llvm/test/Transforms/MemCpyOpt/callslot_deref.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/callslot_deref.ll +++ llvm/test/Transforms/MemCpyOpt/callslot_deref.ll @@ -11,9 +11,8 @@ ; CHECK-LABEL: @must_remove_memcpy( ; CHECK-NEXT: [[SRC:%.*]] = alloca [4096 x i8], align 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds [4096 x i8], [4096 x i8]* [[SRC]], i64 0, i64 0 -; CHECK-NEXT: 
[[DST1:%.*]] = bitcast i8* [[DST:%.*]] to [4096 x i8]* -; CHECK-NEXT: [[DST12:%.*]] = bitcast [4096 x i8]* [[DST1]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DST12]], i8 0, i64 4096, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[P]], i8 0, i64 4096, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[DST:%.*]], i8 0, i64 4096, i1 false) ; CHECK-NEXT: ret void ; %src = alloca [4096 x i8], align 1 Index: llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll +++ llvm/test/Transforms/MemCpyOpt/loadstore-sret.ll @@ -12,9 +12,11 @@ ; CHECK-LABEL: @_Z3foov( ; CHECK-NEXT: _ZNSt8auto_ptrIiED1Ev.exit: ; CHECK-NEXT: [[TEMP_LVALUE:%.*]] = alloca %"class.std::auto_ptr", align 8 -; CHECK-NEXT: call void @_Z3barv(%"class.std::auto_ptr"* sret(%"class.std::auto_ptr") [[AGG_RESULT:%.*]]) +; CHECK-NEXT: call void @_Z3barv(%"class.std::auto_ptr"* sret(%"class.std::auto_ptr") [[TEMP_LVALUE]]) ; CHECK-NEXT: [[TMP_I_I:%.*]] = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* [[TEMP_LVALUE]], i64 0, i32 0 -; CHECK-NEXT: [[TMP_I_I4:%.*]] = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* [[AGG_RESULT]], i64 0, i32 0 +; CHECK-NEXT: [[TMP2_I_I:%.*]] = load i32*, i32** [[TMP_I_I]], align 8 +; CHECK-NEXT: [[TMP_I_I4:%.*]] = getelementptr inbounds %"class.std::auto_ptr", %"class.std::auto_ptr"* [[AGG_RESULT:%.*]], i64 0, i32 0 +; CHECK-NEXT: store i32* [[TMP2_I_I]], i32** [[TMP_I_I4]], align 8 ; CHECK-NEXT: ret void ; _ZNSt8auto_ptrIiED1Ev.exit: Index: llvm/test/Transforms/MemCpyOpt/memcpy.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -78,11 +78,8 @@ define void @test3(%0* noalias sret(%0) %agg.result) nounwind { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[X_0:%.*]] = alloca 
[[TMP0:%.*]], align 16 -; CHECK-NEXT: [[X_01:%.*]] = bitcast %0* [[X_0]] to i8* -; CHECK-NEXT: [[AGG_RESULT1:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8* -; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT1]], i8* align 16 bitcast (%0* @x to i8*), i32 32, i1 false) -; CHECK-NEXT: [[AGG_RESULT2:%.*]] = bitcast %0* [[AGG_RESULT]] to i8* +; CHECK-NEXT: [[AGG_RESULT2:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT2]], i8* align 16 bitcast (%0* @x to i8*), i32 32, i1 false) ; CHECK-NEXT: ret void ; %x.0 = alloca %0 Index: llvm/test/Transforms/MemCpyOpt/sret.ll =================================================================== --- llvm/test/Transforms/MemCpyOpt/sret.ll +++ llvm/test/Transforms/MemCpyOpt/sret.ll @@ -21,9 +21,10 @@ ; CHECK-NEXT: [[TMP8:%.*]] = load x86_fp80, x86_fp80* [[TMP7]], align 16 ; CHECK-NEXT: store x86_fp80 [[TMP3]], x86_fp80* [[REAL]], align 16 ; CHECK-NEXT: store x86_fp80 [[TMP8]], x86_fp80* [[TMP4]], align 16 -; CHECK-NEXT: call void @ccoshl(%0* noalias sret([[TMP0]]) [[AGG_RESULT:%.*]], %0* byval([[TMP0]]) align 8 [[IZ]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: call void @ccoshl(%0* noalias sret([[TMP0]]) [[MEMTMP]], %0* byval([[TMP0]]) align 8 [[IZ]]) #[[ATTR0:[0-9]+]] ; CHECK-NEXT: [[MEMTMP14:%.*]] = bitcast %0* [[MEMTMP]] to i8* -; CHECK-NEXT: [[AGG_RESULT15:%.*]] = bitcast %0* [[AGG_RESULT]] to i8* +; CHECK-NEXT: [[AGG_RESULT15:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT15]], i8* align 16 [[MEMTMP14]], i32 32, i1 false) ; CHECK-NEXT: ret void ; entry: Index: llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll +++ llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll @@ -5,18 +5,39 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( 
-; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": +; X86-NEXT: entry: ; X86-NEXT: [[PTR:%.*]] = alloca i32, align 4 +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 ; X86-NEXT: store i32 42, i32* [[PTR]], align 4 -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X86-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; X86-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]] +; X86: land.rhs.i: +; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 1 +; X86-NEXT: [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4 +; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 1 +; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 +; X86-NEXT: [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; X86-NEXT: br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.2: +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 +; X86-NEXT: [[TMP4:%.*]] = load i32, i32* [[THIRD_I]], align 4 +; X86-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; X86-NEXT: [[TMP5:%.*]] = load i32, i32* [[THIRD2_I]], align 4 +; X86-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X86-NEXT: br i1 [[CMP3_I]], label 
[[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.3: +; X86-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; X86-NEXT: [[TMP6:%.*]] = load i32, i32* [[FOURTH_I]], align 4 +; X86-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; X86-NEXT: [[TMP7:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 +; X86-NEXT: [[CMP4_I:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP8:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ false, [[LAND_RHS_I_2]] ], [ [[CMP4_I]], [[LAND_RHS_I_3]] ] +; X86-NEXT: ret i1 [[TMP8]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { Index: llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll +++ llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll @@ -9,31 +9,37 @@ define zeroext i1 @opeq1( ; CHECK-LABEL: @opeq1( -; CHECK-NEXT: "land.rhs.i+land.rhs.i.2": -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; CHECK-NEXT: [[CSTR3:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR3]], i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[ENTRY2:%.*]], label [[OPEQ1_EXIT:%.*]] -; CHECK: entry2: -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* 
[[TMP4]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]] -; CHECK: land.rhs.i.31: -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; CHECK-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]] +; CHECK: land.rhs.i: +; CHECK-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4 +; CHECK-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 +; CHECK-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br i1 [[CMP3_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] +; CHECK: land.rhs.i.2: +; CHECK-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[THIRD_I]], align 4 +; CHECK-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[THIRD2_I]], align 4 +; CHECK-NEXT: [[CMP4_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: br i1 [[CMP4_I]], label [[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT]] +; CHECK: 
land.rhs.i.3: +; CHECK-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[FOURTH_I]], align 4 +; CHECK-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 +; CHECK-NEXT: [[CMP5_I:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] ; CHECK-NEXT: br label [[OPEQ1_EXIT]] ; CHECK: opeq1.exit: -; CHECK-NEXT: [[TMP13:%.*]] = phi i1 [ [[TMP12]], [[LAND_RHS_I_31]] ], [ false, [[ENTRY2]] ], [ false, %"land.rhs.i+land.rhs.i.2" ] -; CHECK-NEXT: ret i1 [[TMP13]] +; CHECK-NEXT: [[TMP8:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ false, [[LAND_RHS_I_2]] ], [ [[CMP5_I]], [[LAND_RHS_I_3]] ] +; CHECK-NEXT: ret i1 [[TMP8]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 { Index: llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll +++ llvm/test/Transforms/MergeICmps/X86/gep-references-bb.ll @@ -18,17 +18,23 @@ ; CHECK-NEXT: [[R0_ADDR:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[RHS]], i64 0, i32 0 ; CHECK-NEXT: [[R0:%.*]] = load i32, i32* [[R0_ADDR]], align 4 ; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[L0]], [[R0]] -; CHECK-NEXT: br i1 [[CMP0]], label %"bb1+bb2", label [[FINAL:%.*]] -; CHECK: "bb1+bb2": -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[LHS]], i64 0, i32 2 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[GEP]], i64 2 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; CHECK-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[CMP0]], label [[BB1:%.*]], label 
[[FINAL:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[L1_ADDR:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[LHS]], i64 0, i32 2 +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[L1_ADDR]], align 4 +; CHECK-NEXT: [[R1_ADDR:%.*]] = getelementptr inbounds i32, i32* [[GEP]], i64 2 +; CHECK-NEXT: [[R1:%.*]] = load i32, i32* [[R1_ADDR]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[L1]], [[R1]] +; CHECK-NEXT: br i1 [[CMP1]], label [[BB2:%.*]], label [[FINAL]] +; CHECK: bb2: +; CHECK-NEXT: [[L2_ADDR:%.*]] = getelementptr inbounds [[TRIPLE]], %Triple* [[LHS]], i64 0, i32 3 +; CHECK-NEXT: [[L2:%.*]] = load i32, i32* [[L2_ADDR]], align 4 +; CHECK-NEXT: [[R2_ADDR:%.*]] = getelementptr inbounds i32, i32* [[GEP]], i64 3 +; CHECK-NEXT: [[R2:%.*]] = load i32, i32* [[R2_ADDR]], align 4 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[L2]], [[R2]] ; CHECK-NEXT: br label [[FINAL]] ; CHECK: final: -; CHECK-NEXT: [[RET:%.*]] = phi i1 [ false, [[BB0:%.*]] ], [ [[TMP2]], %"bb1+bb2" ] +; CHECK-NEXT: [[RET:%.*]] = phi i1 [ false, [[BB0:%.*]] ], [ false, [[BB1]] ], [ [[CMP2]], [[BB2]] ] ; CHECK-NEXT: ret i1 [[RET]] ; bb0: Index: llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll +++ llvm/test/Transforms/MergeICmps/X86/int64-and-ptr.ll @@ -9,16 +9,23 @@ ; Function Attrs: nounwind uwtable define i1 @test(%struct.outer* align 8 dereferenceable(16) %o1, %struct.outer* align 8 dereferenceable(116) %o2) local_unnamed_addr #0 { ; CHECK-LABEL: @test( -; CHECK-NEXT: "entry+if.then": -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_OUTER:%.*]], %struct.outer* [[O1:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_OUTER]], %struct.outer* [[O2:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i64* [[TMP0]] to i8* -; CHECK-NEXT: [[CSTR1:%.*]] = bitcast i64* [[TMP1]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* 
[[CSTR]], i8* [[CSTR1]], i64 16) -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br label [[IF_END5:%.*]] +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [[STRUCT_OUTER:%.*]], %struct.outer* [[O1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[P1]], align 8 +; CHECK-NEXT: [[P11:%.*]] = getelementptr inbounds [[STRUCT_OUTER]], %struct.outer* [[O2:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[P11]], align 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END5:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds [[STRUCT_OUTER]], %struct.outer* [[O1]], i64 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.inner*, %struct.inner** [[P2]], align 8 +; CHECK-NEXT: [[P22:%.*]] = getelementptr inbounds [[STRUCT_OUTER]], %struct.outer* [[O2]], i64 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load %struct.inner*, %struct.inner** [[P22]], align 8 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq %struct.inner* [[TMP2]], [[TMP3]] +; CHECK-NEXT: br label [[IF_END5]] ; CHECK: if.end5: -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[REZ_0:%.*]] = phi i1 [ [[CMP3]], [[IF_THEN]] ], [ false, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i1 [[REZ_0]] ; entry: %p1 = getelementptr inbounds %struct.outer, %struct.outer* %o1, i64 0, i32 0 Index: llvm/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll +++ llvm/test/Transforms/MergeICmps/X86/multiple-blocks-does-work.ll @@ -23,18 +23,24 @@ ; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 ; X86-NEXT: call void (...) 
@foo() ; X86-NEXT: [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] -; X86-NEXT: br i1 [[CMP2_I]], label %"land.rhs.i.2+land.rhs.i.3", label [[OPEQ1_EXIT]] -; X86: "land.rhs.i.2+land.rhs.i.3": -; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 -; X86-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP4]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP5]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP6:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.2: +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 +; X86-NEXT: [[TMP4:%.*]] = load i32, i32* [[THIRD_I]], align 4 +; X86-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; X86-NEXT: [[TMP5:%.*]] = load i32, i32* [[THIRD2_I]], align 4 +; X86-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X86-NEXT: br i1 [[CMP3_I]], label [[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.3: +; X86-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; X86-NEXT: [[TMP6:%.*]] = load i32, i32* [[FOURTH_I]], align 4 +; X86-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; X86-NEXT: [[TMP7:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 +; X86-NEXT: [[CMP4_I:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] ; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: [[TMP7:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ [[TMP6]], %"land.rhs.i.2+land.rhs.i.3" ] -; X86-NEXT: ret i1 [[TMP7]] +; X86-NEXT: [[TMP8:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ false, [[LAND_RHS_I_2]] ], [ [[CMP4_I]], [[LAND_RHS_I_3]] ] +; X86-NEXT: ret i1 [[TMP8]] ; %S* nocapture readonly dereferenceable(16) %a, %S* nocapture 
readonly dereferenceable(16) %b) local_unnamed_addr #0 { Index: llvm/test/Transforms/MergeICmps/X86/pair-int32-int32.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/pair-int32-int32.ll +++ llvm/test/Transforms/MergeICmps/X86/pair-int32-int32.ll @@ -6,16 +6,23 @@ define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: "entry+land.rhs.i": -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X86-NEXT: entry: +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 +; X86-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; X86-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]] +; X86: land.rhs.i: +; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 1 +; X86-NEXT: [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4 +; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 1 +; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 +; X86-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP3_I]], [[LAND_RHS_I]] ] +; X86-NEXT: ret i1 [[TMP4]] ; ; X86-NOBUILTIN-LABEL: @opeq1( ; X86-NOBUILTIN-NEXT: 
entry: @@ -66,16 +73,23 @@ ; Same as above, but the two blocks are in inverse order. define zeroext i1 @opeq1_inverse( ; X86-LABEL: @opeq1_inverse( -; X86-NEXT: "land.rhs.i+entry": -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 8) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X86-NEXT: entry: +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 1 +; X86-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 1 +; X86-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 +; X86-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; X86-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]] +; X86: land.rhs.i: +; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 0 +; X86-NEXT: [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4 +; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 0 +; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 +; X86-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP3_I]], [[LAND_RHS_I]] ] +; X86-NEXT: ret i1 [[TMP4]] ; ; X86-NOBUILTIN-LABEL: @opeq1_inverse( ; X86-NOBUILTIN-NEXT: entry: Index: llvm/test/Transforms/MergeICmps/X86/pr41917.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/pr41917.ll +++ 
llvm/test/Transforms/MergeICmps/X86/pr41917.ll @@ -10,21 +10,28 @@ define dso_local zeroext i1 @pr41917(%class.a* byval(%class.a) nocapture readonly align 4 %g, %class.a* byval(%class.a) nocapture readonly align 4 %p2) local_unnamed_addr #0 { ; CHECK-LABEL: @pr41917( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR3:[0-9]+]] -; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label %"land.end+land.rhs3" +; CHECK-NEXT: [[CALL:%.*]] = tail call zeroext i1 @f2() #[[ATTR2:[0-9]+]] +; CHECK-NEXT: br i1 [[CALL]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] ; CHECK: land.rhs: -; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR3]] -; CHECK-NEXT: br label %"land.end+land.rhs3" -; CHECK: "land.end+land.rhs3": -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], %class.a* [[G:%.*]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[CLASS_A]], %class.a* [[P2:%.*]], i32 0, i32 1 -; CHECK-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; CHECK-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i32 8) -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; CHECK-NEXT: br label [[LAND_END6:%.*]] +; CHECK-NEXT: [[CALL1:%.*]] = tail call zeroext i1 @f2() #[[ATTR2]] +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[C:%.*]] = getelementptr inbounds [[CLASS_A:%.*]], %class.a* [[G:%.*]], i32 0, i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C]], align 4, !tbaa [[TBAA3:![0-9]+]] +; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds [[CLASS_A]], %class.a* [[P2:%.*]], i32 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[C2]], align 4, !tbaa [[TBAA3]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP]], label [[LAND_RHS3:%.*]], label [[LAND_END6:%.*]] +; CHECK: land.rhs3: +; CHECK-NEXT: [[H:%.*]] = getelementptr inbounds [[CLASS_A]], %class.a* 
[[G]], i32 0, i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[H]], align 4, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: [[H4:%.*]] = getelementptr inbounds [[CLASS_A]], %class.a* [[P2]], i32 0, i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[H4]], align 4, !tbaa [[TBAA8]] +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: br label [[LAND_END6]] ; CHECK: land.end6: -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ false, [[LAND_END]] ], [ [[CMP5]], [[LAND_RHS3]] ] +; CHECK-NEXT: ret i1 [[TMP4]] ; entry: %call = tail call zeroext i1 @f2() #2 Index: llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll =================================================================== --- llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll +++ llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -8,17 +8,38 @@ ; We can split %entry and create a memcmp(16 bytes). define zeroext i1 @opeq1( ; X86-LABEL: @opeq1( -; X86-NEXT: "entry+land.rhs.i+land.rhs.i.2+land.rhs.i.3": +; X86-NEXT: entry: +; X86-NEXT: [[FIRST_I:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4 +; X86-NEXT: [[FIRST1_I:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 +; X86-NEXT: [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4 ; X86-NEXT: call void (...) 
@foo() -; X86-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], %S* [[A:%.*]], i64 0, i32 0 -; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], %S* [[B:%.*]], i64 0, i32 0 -; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[TMP0]] to i8* -; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[TMP1]] to i8* -; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16) -; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0 -; X86-NEXT: br label [[OPEQ1_EXIT:%.*]] +; X86-NEXT: [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; X86-NEXT: br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]] +; X86: land.rhs.i: +; X86-NEXT: [[SECOND_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 1 +; X86-NEXT: [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4 +; X86-NEXT: [[SECOND2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 1 +; X86-NEXT: [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4 +; X86-NEXT: [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; X86-NEXT: br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.2: +; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 2 +; X86-NEXT: [[TMP4:%.*]] = load i32, i32* [[THIRD_I]], align 4 +; X86-NEXT: [[THIRD2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 2 +; X86-NEXT: [[TMP5:%.*]] = load i32, i32* [[THIRD2_I]], align 4 +; X86-NEXT: [[CMP3_I:%.*]] = icmp eq i32 [[TMP4]], [[TMP5]] +; X86-NEXT: br i1 [[CMP3_I]], label [[LAND_RHS_I_3:%.*]], label [[OPEQ1_EXIT]] +; X86: land.rhs.i.3: +; X86-NEXT: [[FOURTH_I:%.*]] = getelementptr inbounds [[S]], %S* [[A]], i64 0, i32 3 +; X86-NEXT: [[TMP6:%.*]] = load i32, i32* [[FOURTH_I]], align 4 +; X86-NEXT: [[FOURTH2_I:%.*]] = getelementptr inbounds [[S]], %S* [[B]], i64 0, i32 3 +; X86-NEXT: [[TMP7:%.*]] = load i32, i32* [[FOURTH2_I]], align 4 +; X86-NEXT: [[CMP4_I:%.*]] = icmp eq i32 [[TMP6]], [[TMP7]] +; X86-NEXT: br label [[OPEQ1_EXIT]] ; X86: opeq1.exit: -; 
X86-NEXT: ret i1 [[TMP2]] +; X86-NEXT: [[TMP8:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ false, [[LAND_RHS_I]] ], [ false, [[LAND_RHS_I_2]] ], [ [[CMP4_I]], [[LAND_RHS_I_3]] ] +; X86-NEXT: ret i1 [[TMP8]] ; ; Make sure this call is moved to the beginning of the entry block. %S* nocapture readonly dereferenceable(16) %a, Index: llvm/test/Transforms/SimplifyCFG/X86/SpeculativeExec.ll =================================================================== --- llvm/test/Transforms/SimplifyCFG/X86/SpeculativeExec.ll +++ llvm/test/Transforms/SimplifyCFG/X86/SpeculativeExec.ll @@ -126,11 +126,13 @@ ; CHECK-NEXT: br i1 [[T1]], label [[BB1:%.*]], label [[BB3:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[T2:%.*]] = icmp sgt i32 [[C:%.*]], 1 -; CHECK-NEXT: [[T3:%.*]] = load i32*, i32** [[PTR3:%.*]], align 8 -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[T2]], i32* [[T3]], i32* [[PTR2:%.*]] -; CHECK-NEXT: ret i32* [[SPEC_SELECT]] +; CHECK-NEXT: br i1 [[T2]], label [[BB2:%.*]], label [[BB3]] +; CHECK: bb2: +; CHECK-NEXT: [[T3:%.*]] = load i32*, i32** [[PTR3:%.*]], align 8, !dereferenceable !0 +; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: ret i32* [[PTR1:%.*]] +; CHECK-NEXT: [[T4:%.*]] = phi i32* [ [[PTR1:%.*]], [[ENTRY:%.*]] ], [ [[PTR2:%.*]], [[BB1]] ], [ [[T3]], [[BB2]] ] +; CHECK-NEXT: ret i32* [[T4]] ; entry: %t1 = icmp eq i32 %b, 0 Index: llvm/test/Transforms/TailCallElim/reorder_load.ll =================================================================== --- llvm/test/Transforms/TailCallElim/reorder_load.ll +++ llvm/test/Transforms/TailCallElim/reorder_load.ll @@ -189,20 +189,16 @@ define fastcc i32 @raise_load_5(i32* dereferenceable(4) align 4 %a_arg, i32 %a_len_arg, i32 %start_arg) readonly { ; CHECK-LABEL: @raise_load_5( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[TAILRECURSE:%.*]] -; CHECK: tailrecurse: -; CHECK-NEXT: [[ACCUMULATOR_TR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[ELSE:%.*]] ] -; CHECK-NEXT: [[START_ARG_TR:%.*]] = phi i32 [ 
[[START_ARG:%.*]], [[ENTRY]] ], [ [[TMP7:%.*]], [[ELSE]] ] -; CHECK-NEXT: [[TMP2:%.*]] = icmp sge i32 [[START_ARG_TR]], [[A_LEN_ARG:%.*]] -; CHECK-NEXT: br i1 [[TMP2]], label [[IF:%.*]], label [[ELSE]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sge i32 [[START_ARG:%.*]], [[A_LEN_ARG:%.*]] +; CHECK-NEXT: br i1 [[TMP2]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ACCUMULATOR_RET_TR:%.*]] = add i32 0, [[ACCUMULATOR_TR]] -; CHECK-NEXT: ret i32 [[ACCUMULATOR_RET_TR]] +; CHECK-NEXT: ret i32 0 ; CHECK: else: -; CHECK-NEXT: [[TMP7]] = add i32 [[START_ARG_TR]], 1 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ARG:%.*]], align 4 -; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[ACCUMULATOR_TR]] -; CHECK-NEXT: br label [[TAILRECURSE]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[START_ARG]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = tail call fastcc i32 @raise_load_5(i32* [[A_ARG:%.*]], i32 [[A_LEN_ARG]], i32 [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ARG]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP8]] +; CHECK-NEXT: ret i32 [[TMP10]] ; entry: %tmp2 = icmp sge i32 %start_arg, %a_len_arg ; [#uses=1] Index: llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -174,9 +174,8 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* 
%p, align 4 @@ -186,8 +185,9 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float* +; CHECK-NEXT: [[S:%.*]] = load float, float* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %b = bitcast <4 x float>* %p to float* @@ -200,9 +200,8 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -214,9 +213,9 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b = bitcast <16 x i8>* %p to i32* @@ -229,8 +228,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) { ; 
CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[S:%.*]] = load float, float* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0 @@ -243,8 +243,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[S:%.*]] = load float, float addrspace(44)* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0 @@ -258,9 +259,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x 
i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 @@ -272,16 +272,11 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 2 @@ -292,16 +287,11 @@ ; Verify that alignment of the new load is not over-specified. 
define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 8 @@ -330,9 +320,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) { ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12 +; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[GEP]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 1 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12 @@ -366,9 +357,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x 
i16>* align 16 dereferenceable(32) %p) { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -470,9 +460,8 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v4f32_align( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -495,9 +484,8 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> poison, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -507,8 +495,9 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: 
@casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> poison, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -519,9 +508,8 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v16f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -531,9 +519,8 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -586,9 +573,9 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], 
align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, <2 x float>* %p, align 4 @@ -599,9 +586,9 @@ define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <8 x float>, <8 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = extractelement <8 x float> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, <8 x float>* %p, align 4 @@ -628,18 +615,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. 
define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 -; SSE2-NEXT: [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8 -; SSE2-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 +; CHECK-NEXT: [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8 +; CHECK-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1 %l = load <2 x i16>, <2 x i16>* %gep, align 8 Index: llvm/test/Transforms/VectorCombine/X86/load.ll =================================================================== --- llvm/test/Transforms/VectorCombine/X86/load.ll +++ llvm/test/Transforms/VectorCombine/X86/load.ll @@ -174,9 +174,8 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; 
CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -186,8 +185,9 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <4 x float>* [[P:%.*]] to float* +; CHECK-NEXT: [[S:%.*]] = load float, float* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %b = bitcast <4 x float>* %p to float* @@ -200,9 +200,8 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -214,9 +213,9 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <16 x i8>* [[P:%.*]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b = bitcast <16 x i8>* %p 
to i32* @@ -229,8 +228,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[S:%.*]] = load float, float* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0 @@ -243,8 +243,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[S:%.*]] = load float, float addrspace(44)* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0 @@ -258,9 +259,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, 
i16* [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 @@ -272,16 +272,11 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 2 @@ -292,16 +287,11 @@ ; Verify that alignment of the new load is not over-specified. 
define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 8 @@ -330,9 +320,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) { ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12 +; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[GEP]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 1 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12 @@ -366,9 +357,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x 
i16>* align 16 dereferenceable(32) %p) { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -470,9 +460,8 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v4f32_align( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -495,9 +484,8 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -507,8 +495,9 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) { ; CHECK-LABEL: 
@casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32* +; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -519,9 +508,8 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v16f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -531,9 +519,8 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -586,9 +573,9 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], 
align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, <2 x float>* %p, align 4 @@ -599,9 +586,9 @@ define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) { ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <8 x float>, <8 x float>* [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = extractelement <8 x float> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, <8 x float>* %p, align 4 @@ -628,18 +615,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. 
define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 -; SSE2-NEXT: [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8 -; SSE2-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 +; CHECK-NEXT: [[L:%.*]] = load <2 x i16>, <2 x i16>* [[GEP]], align 8 +; CHECK-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1 %l = load <2 x i16>, <2 x i16>* %gep, align 8