Index: lib/Analysis/ConstantFolding.cpp
===================================================================
--- lib/Analysis/ConstantFolding.cpp
+++ lib/Analysis/ConstantFolding.cpp
@@ -98,6 +98,11 @@
   if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() &&
       !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
     return Constant::getAllOnesValue(DestTy);
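+  // Non-integral pointers have no defined bit representation, so a cast that
+  // changes whether a type is a non-integral pointer cannot be folded.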
+  if (DL.isNonIntegralPointerType(C->getType()->getScalarType()) !=
+      DL.isNonIntegralPointerType(DestTy->getScalarType()))
+    return nullptr;
 
   if (auto *VTy = dyn_cast<VectorType>(C->getType())) {
     // Handle a vector->scalar integer/fp cast.
@@ -323,10 +328,26 @@
                                          const DataLayout &DL) {
   do {
     Type *SrcTy = C->getType();
+    uint64_t DestSize = DL.getTypeSizeInBits(DestTy);
+    uint64_t SrcSize = DL.getTypeSizeInBits(SrcTy);
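+    // There are no extra bits to supply if the destination is wider; give up.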
+    if (SrcSize < DestSize)
+      return nullptr;
+
+    // Catch the obvious splat cases (all-zeros may legally be coerced even to
+    // non-integral pointer types, since null is assumed to be zero).
+    if (C->isNullValue() && !DestTy->isX86_MMXTy())
+      return Constant::getNullValue(DestTy);
+    if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() &&
+        !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
+      return Constant::getAllOnesValue(DestTy);
 
     // If the type sizes are the same and a cast is legal, just directly
     // cast the constant.
-    if (DL.getTypeSizeInBits(DestTy) == DL.getTypeSizeInBits(SrcTy)) {
+    // But be careful not to coerce non-integral pointers illegally.
+    if (SrcSize == DestSize &&
+        DL.isNonIntegralPointerType(SrcTy->getScalarType()) ==
+        DL.isNonIntegralPointerType(DestTy->getScalarType())) {
       Instruction::CastOps Cast = Instruction::BitCast;
       // If we are going from a pointer to int or vice versa, we spell the cast
       // differently.
Index: lib/Transforms/Scalar/GVN.cpp
===================================================================
--- lib/Transforms/Scalar/GVN.cpp
+++ lib/Transforms/Scalar/GVN.cpp
@@ -878,11 +878,12 @@
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
 
+  Instruction *DepInst = DepInfo.getInst();
   if (DepInfo.isClobber()) {
     // If the dependence is to a store that writes to a superset of the bits
     // read by the load, we can extract the bits we need for the load from the
     // stored value.
-    if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) {
+    if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
       // Can't forward from non-atomic to atomic without violating memory model.
       if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
         int Offset =
@@ -898,7 +899,7 @@
     //    load i32* P
     //    load i8* (P+1)
     // if we have this, replace the later with an extraction from the former.
-    if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInfo.getInst())) {
+    if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
       // If this is a clobber and L is the first instruction in its block, then
       // we have the first instruction in the entry block.
       // Can't forward from non-atomic to atomic without violating memory model.
@@ -915,7 +916,7 @@
 
     // If the clobbering value is a memset/memcpy/memmove, see if we can
     // forward a value on from it.
-    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
+    if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
       if (Address && !LI->isAtomic()) {
         int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
                                                       DepMI, DL);
@@ -929,8 +930,7 @@
     LLVM_DEBUG(
         // fast print dep, using operator<< on instruction is too slow.
         dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
-        Instruction *I = DepInfo.getInst();
-        dbgs() << " is clobbered by " << *I << '\n';);
+        dbgs() << " is clobbered by " << *DepInst << '\n';);
     if (ORE->allowExtraAnalysis(DEBUG_TYPE))
       reportMayClobberedLoad(LI, DepInfo, DT, ORE);
 
@@ -938,8 +938,6 @@
   }
   assert(DepInfo.isDef() && "follows from above");
 
-  Instruction *DepInst = DepInfo.getInst();
-
   // Loading the allocation -> undef.
   if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
       // Loading immediately after lifetime begin -> undef.
@@ -958,8 +956,7 @@
     // Reject loads and stores that are to the same address but are of
     // different types if we have to. If the stored value is larger or equal to
     // the loaded value, we can reuse it.
-    if (S->getValueOperand()->getType() != LI->getType() &&
-        !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
+    if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(),
                                          LI->getType(), DL))
       return false;
 
@@ -975,8 +972,7 @@
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (LD->getType() != LI->getType() &&
-        !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
       return false;
 
     // Can't forward from non-atomic to atomic without violating memory model.
Index: lib/Transforms/Utils/VNCoercion.cpp
===================================================================
--- lib/Transforms/Utils/VNCoercion.cpp
+++ lib/Transforms/Utils/VNCoercion.cpp
@@ -11,16 +11,19 @@
 namespace llvm {
 namespace VNCoercion {
 
-/// Return true if coerceAvailableValueToLoadType will succeed.
-bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
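+/// Return true if a value of type \p StoredTy could be coerced to \p LoadTy.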
+bool canCoerceMustAliasedValueToLoad(Type *StoredTy, Type *LoadTy,
                                      const DataLayout &DL) {
+  if (StoredTy == LoadTy)
+    return true;
+
   // If the loaded or stored value is an first class array or struct, don't try
   // to transform them.  We need to be able to bitcast to integer.
   if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
-      StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+      StoredTy->isStructTy() || StoredTy->isArrayTy())
     return false;
 
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredVal->getType());
+  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy);
 
   // The store size must be byte-aligned to support future type casts.
   if (llvm::alignTo(StoreSize, 8) != StoreSize)
@@ -31,19 +34,26 @@
     return false;
 
   // Don't coerce non-integral pointers to integers or vice versa.
-  if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()) !=
-      DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
-    // As a special case, allow coercion of memset used to initialize
-    // an array w/null.  Despite non-integral pointers not generally having a
-    // specific bit pattern, we do assume null is zero.
-    if (auto *CI = dyn_cast<Constant>(StoredVal))
-      return CI->isNullValue();
+  if (DL.isNonIntegralPointerType(StoredTy->getScalarType()) !=
+      DL.isNonIntegralPointerType(LoadTy->getScalarType()))
     return false;
-  }
-  
+
   return true;
 }
 
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+                                     const DataLayout &DL) {
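+  // Stores of constant null are a special case: null has the same (all-zero)
+  // representation in every type, including non-integral pointer types.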
+  Type *StoredTy = StoredVal->getType();
+  if (auto *CI = dyn_cast<Constant>(StoredVal))
+    if (CI->isNullValue() && (StoredTy == LoadTy ||
+        DL.getTypeSizeInBits(StoredTy) >= DL.getTypeSizeInBits(LoadTy)))
+      return true;
+  return canCoerceMustAliasedValueToLoad(StoredTy, LoadTy, DL);
+}
+
 template <class T, class HelperClass>
 static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
                                                HelperClass &Helper,
@@ -160,11 +170,6 @@
                                           Value *WritePtr,
                                           uint64_t WriteSizeInBits,
                                           const DataLayout &DL) {
-  // If the loaded or stored value is a first class array or struct, don't try
-  // to transform them.  We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
-    return -1;
-
   int64_t StoreOffset = 0, LoadOffset = 0;
   Value *StoreBase =
       GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
@@ -214,21 +219,9 @@
 int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
                                    StoreInst *DepSI, const DataLayout &DL) {
   auto *StoredVal = DepSI->getValueOperand();
-  
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (StoredVal->getType()->isStructTy() ||
-      StoredVal->getType()->isArrayTy())
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
     return -1;
 
-  // Don't coerce non-integral pointers to integers or vice versa.
-  if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()) !=
-      DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
-    // Allow casts of zero values to null as a special case
-    auto *CI = dyn_cast<Constant>(StoredVal);
-    if (!CI || !CI->isNullValue())
-      return -1;
-  }
-
   Value *StorePtr = DepSI->getPointerOperand();
   uint64_t StoreSize =
       DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
@@ -241,39 +234,36 @@
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
-    return -1;
-
-  // Don't coerce non-integral pointers to integers or vice versa.
-  if (DL.isNonIntegralPointerType(DepLI->getType()->getScalarType()) !=
-      DL.isNonIntegralPointerType(LoadTy->getScalarType()))
-    return -1;
-
   Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
-  int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
-  if (R != -1)
-    return R;
+  if (canCoerceMustAliasedValueToLoad(DepLI->getType(), LoadTy, DL)) {
+    uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+    int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+    if (R != -1)
+      return R;
+  }
 
   // If we have a load/load clobber an DepLI can be widened to cover this load,
   // then we should widen it!
-  int64_t LoadOffs = 0;
-  const Value *LoadBase =
-      GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  if (canCoerceMustAliasedValueToLoad(LoadTy, DepLI->getType(), DL)) {
+    int64_t LoadOffs = 0;
+    const Value *LoadBase =
+        GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+    unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+    unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
+        LoadBase, LoadOffs, LoadSize, DepLI);
+    if (Size == 0)
+      return -1;
 
-  unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
-      LoadBase, LoadOffs, LoadSize, DepLI);
-  if (Size == 0)
-    return -1;
+    // Check non-obvious conditions enforced by MDA, which we rely on to be
+    // able to materialize this potentially-available value.
+    assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+    assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
 
-  // Check non-obvious conditions enforced by MDA which we rely on for being
-  // able to materialize this potentially available value
-  assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
-  assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+    return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+  }
 
-  return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+  return -1;
 }
 
 int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
@@ -287,13 +277,18 @@
   // If this is memset, we just need to see if the offset is valid in the size
   // of the memset..
   if (MI->getIntrinsicID() == Intrinsic::memset) {
-    if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
-      auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
-      if (!CI || !CI->isZero())
-        return -1;
-    }
-    return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
-                                          MemSizeInBits, DL);
+    Value *StoredVal = cast<MemSetInst>(MI)->getValue();
+    if (auto *CI = dyn_cast<Constant>(StoredVal))
+      if (CI->isNullValue())
+        return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                              MemSizeInBits, DL);
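+    // Otherwise, conservatively model the memset as a store of an integer of
+    // the full written width and check that it could be coerced.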
+    Type *StoreTy = IntegerType::get(LoadTy->getContext(), MemSizeInBits);
+    if (canCoerceMustAliasedValueToLoad(StoreTy, LoadTy, DL))
+      return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                            MemSizeInBits, DL);
+    return -1;
   }
 
   // If we have a memcpy/memmove, the only case we can handle is if this is a
@@ -306,7 +301,8 @@
     return -1;
 
   GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
-  if (!GV || !GV->isConstant())
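+  // The initializer must be definitive, or runtime contents may not match it.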
+  if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
     return -1;
 
   // See if the access is within the bounds of the transfer.
@@ -315,21 +311,17 @@
   if (Offset == -1)
     return Offset;
 
-  // Don't coerce non-integral pointers to integers or vice versa, and the
-  // memtransfer is implicitly a raw byte code
-  if (DL.isNonIntegralPointerType(LoadTy->getScalarType()))
-    // TODO: Can allow nullptrs from constant zeros
-    return -1;
-
   unsigned AS = Src->getType()->getPointerAddressSpace();
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
-  Src =
-      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
-  Constant *OffsetCst =
-      ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
-                                       OffsetCst);
+  if (Offset) {
+    Src =
+        ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+    Constant *OffsetCst =
+        ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+    Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                         OffsetCst);
+  }
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
     return Offset;
@@ -341,6 +333,11 @@
                                      HelperClass &Helper,
                                      const DataLayout &DL) {
   LLVMContext &Ctx = SrcVal->getType()->getContext();
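+  // Forwarding a stored null is always safe: every slice of an all-zero value
+  // is itself zero/null.  (Check SrcVal directly; a GEP off null is not null.)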
+  if (auto *CI = dyn_cast<Constant>(SrcVal))
+    if (CI->isNullValue())
+      return Constant::getNullValue(LoadTy);
 
   // If two pointers are in the same address space, they have the same size,
   // so we don't need to do any truncation, etc. This avoids introducing
@@ -468,6 +465,12 @@
     // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
     // independently of what the offset is.
     T *Val = cast<T>(MSI->getValue());
+    if (auto *CI = dyn_cast<Constant>(Val)) {
+      // memset(P, '\0', 1234) -> just directly create the null value for *P,
+      // bypassing any later validity checks.
+      if (CI->isNullValue())
+        return Constant::getNullValue(LoadTy);
+    }
     if (LoadSize != 1)
       Val =
           Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
@@ -496,16 +499,18 @@
   // Otherwise, this is a memcpy/memmove from a constant global.
   MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
   Constant *Src = cast<Constant>(MTI->getSource());
-  unsigned AS = Src->getType()->getPointerAddressSpace();
 
+  unsigned AS = Src->getType()->getPointerAddressSpace();
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
-  Src =
-      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
-  Constant *OffsetCst =
-      ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
-                                       OffsetCst);
+  if (Offset) {
+    Src =
+        ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+    Constant *OffsetCst =
+        ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+    Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                         OffsetCst);
+  }
   Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
 }
Index: test/Transforms/GVN/non-integral-pointers.ll
===================================================================
--- test/Transforms/GVN/non-integral-pointers.ll
+++ test/Transforms/GVN/non-integral-pointers.ll
@@ -169,53 +169,105 @@
   ret i8 addrspace(4)* %ref
 }
 
 @NonZeroConstant = constant <4 x i64> <i64 3, i64 3, i64 3, i64 3>
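+; A non-integral pointer constant expressible without an inttoptr (GEPs off null).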
+@NonZeroConstant2 = constant <4 x i64 addrspace(4)*> <
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3),
+  i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)>
 @ZeroConstant = constant <4 x i64> zeroinitializer
 
 
 ; Can't forward as the load might be dead.  (Pretend we wrote out the alwaysfalse idiom above.)
-define i8 addrspace(4)* @neg_forward_memcopy(i8 addrspace(4)* addrspace(4)* %loc) {
+define i64 addrspace(4)* @neg_forward_memcopy(i64 addrspace(4)* addrspace(4)* %loc) {
 ; CHECK-LABEL: @neg_forward_memcopy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]]
-; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+; CHECK-NEXT:    [[REF:%.*]] = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* [[LOC]]
+; CHECK-NEXT:    ret i64 addrspace(4)* [[REF]]
 ;
 entry:
-  %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %loc.bc = bitcast i64 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
   %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
   call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
-  %ref = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %loc
-  ret i8 addrspace(4)* %ref
+  %ref = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* %loc
+  ret i64 addrspace(4)* %ref
 }
 
-define <1 x i8 addrspace(4)*> @neg_forward_memcpy_vload(<1 x i8 addrspace(4)*> addrspace(4)* %loc) {
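+; Can forward: the stored pointer constant needs no integer bit representation.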
+define i64 addrspace(4)* @forward_memcopy(i64 addrspace(4)* addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcopy(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i64 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 8, i1 false)
+; CHECK-NEXT:    ret i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)
+;
+entry:
+  %loc.bc = bitcast i64 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
+  %ref = load i64 addrspace(4)*, i64 addrspace(4)* addrspace(4)* %loc
+  ret i64 addrspace(4)* %ref
+}
+
+define <4 x i64 addrspace(4)*> @neg_forward_memcpy_vload(<4 x i64 addrspace(4)*> addrspace(4)* %loc) {
 ; CHECK-LABEL: @neg_forward_memcpy_vload(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <1 x i8 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
-; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 8, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* [[LOC]]
-; CHECK-NEXT:    ret <1 x i8 addrspace(4)*> [[REF]]
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @NonZeroConstant to i8*), i64 32, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* [[LOC]]
+; CHECK-NEXT:    ret <4 x i64 addrspace(4)*> [[REF]]
 ;
 entry:
-  %loc.bc = bitcast <1 x i8 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  %loc.bc = bitcast <4 x i64 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
   %src.bc = bitcast <4 x i64>* @NonZeroConstant to i8*
-  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 8, i1 false)
-  %ref = load <1 x i8 addrspace(4)*>, <1 x i8 addrspace(4)*> addrspace(4)* %loc
-  ret <1 x i8 addrspace(4)*> %ref
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* %loc
+  ret <4 x i64 addrspace(4)*> %ref
 }
 
+define <4 x i64> @neg_forward_memcpy_vload2(<4 x i64> addrspace(4)* %loc) {
+; CHECK-LABEL: @neg_forward_memcpy_vload2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 32, i1 false)
+; CHECK-NEXT:    [[REF:%.*]] = load <4 x i64>, <4 x i64> addrspace(4)* [[LOC]]
+; CHECK-NEXT:    ret <4 x i64> [[REF]]
+;
+entry:
+  %loc.bc = bitcast <4 x i64> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64>, <4 x i64> addrspace(4)* %loc
+  ret <4 x i64> %ref
+}
+
+define <1 x i64 addrspace(4)*> @forward_memcpy_vload2(<4 x i64 addrspace(4)*> addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memcpy_vload2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast <4 x i64 addrspace(4)*> addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*), i64 32, i1 false)
+; CHECK-NEXT:    ret <1 x i64 addrspace(4)*> <i64 addrspace(4)* getelementptr (i64, i64 addrspace(4)* null, i32 3)>
+;
+entry:
+  %loc.bc = bitcast <4 x i64 addrspace(4)*> addrspace(4)* %loc to i8 addrspace(4)*
+  %src.bc = bitcast <4 x i64 addrspace(4)*>* @NonZeroConstant2 to i8*
+  call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8* %src.bc, i64 32, i1 false)
+  %ref = load <4 x i64 addrspace(4)*>, <4 x i64 addrspace(4)*> addrspace(4)* %loc
+  %val = extractelement <4 x i64 addrspace(4)*> %ref, i32 0
+  %ret = insertelement <1 x i64 addrspace(4)*> undef, i64 addrspace(4)* %val, i32 0
+  ret <1 x i64 addrspace(4)*> %ret
+}
 
 ; Can forward since we can do so w/o breaking types
-; TODO: missed optimization
 define i8 addrspace(4)* @forward_memcpy_zero(i8 addrspace(4)* addrspace(4)* %loc) {
 ; CHECK-LABEL: @forward_memcpy_zero(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast i8 addrspace(4)* addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
 ; CHECK-NEXT:    call void @llvm.memcpy.p4i8.p0i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8* bitcast (<4 x i64>* @ZeroConstant to i8*), i64 8, i1 false)
-; CHECK-NEXT:    [[REF:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* [[LOC]]
-; CHECK-NEXT:    ret i8 addrspace(4)* [[REF]]
+; CHECK-NEXT:    ret i8 addrspace(4)* null
 ;
 entry:
   %loc.bc = bitcast i8 addrspace(4)* addrspace(4)* %loc to i8 addrspace(4)*
Index: test/Transforms/GlobalOpt/evaluate-call-errors.ll
===================================================================
--- test/Transforms/GlobalOpt/evaluate-call-errors.ll
+++ test/Transforms/GlobalOpt/evaluate-call-errors.ll
@@ -65,7 +65,7 @@
 }
 
 define internal %struct.Foo* @_ZL3foov() {
-  ret %struct.Foo* null
+  ret %struct.Foo* getelementptr (%struct.Foo, %struct.Foo* null, i32 1)
 }
 
 define linkonce_odr void @_ZN1QC2Ev(%struct.Q*) unnamed_addr align 2 {
@@ -73,7 +73,7 @@
   store %struct.Q* %0, %struct.Q** %2, align 8
   %3 = load %struct.Q*, %struct.Q** %2, align 8
   %4 = getelementptr inbounds %struct.Q, %struct.Q* %3, i32 0, i32 0
-  %5 = call i32 bitcast (i32 (i32)* @_ZL3baz3Foo to i32 (%struct.Foo*)*)(%struct.Foo* null)
+  %5 = call i32 bitcast (i32 (i32)* @_ZL3baz3Foo to i32 (%struct.Foo*)*)(%struct.Foo* getelementptr (%struct.Foo, %struct.Foo* null, i32 1))
   store i32 %5, i32* %4, align 4
   ret void
 }