diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -30,7 +30,7 @@
   uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
 
   // The store size must be byte-aligned to support future type casts.
-  if (llvm::alignTo(StoreSize, 8) != StoreSize)
+  if (StoreSize & 7)
     return false;
 
   // The store has to be at least as big as the load.
@@ -125,13 +125,13 @@
     StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
   }
 
-  // If this is a big-endian system, we need to shift the value down to the low
+  // If this is a big-endian system, we need to shift the bytes down to the low
   // bits so that a truncate will work.
   if (DL.isBigEndian()) {
-    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
-                        DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
+    uint64_t ShiftBytes = DL.getTypeStoreSize(StoredValTy).getFixedSize() -
+                          DL.getTypeStoreSize(LoadedTy).getFixedSize();
     StoredVal = Helper.CreateLShr(
-        StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+        StoredVal, ConstantInt::get(StoredVal->getType(), ShiftBytes * 8));
   }
 
   // Truncate the integer to the right size now.
@@ -189,17 +189,16 @@
   if (StoreBase != LoadBase)
     return -1;
 
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
-
-  if ((WriteSizeInBits & 7) | (LoadSize & 7))
+  if (WriteSizeInBits & 7) // Shifting the input is not handled.
     return -1;
-  uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
-  LoadSize /= 8;
+  uint64_t StoreSize = (WriteSizeInBits + 7) / 8; // Convert to bytes.
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
+  LoadSize = (LoadSize + 7) / 8;
 
-  // If the Load isn't completely contained within the stored bits, we don't
-  // have all the bits to feed it.  We could do something crazy in the future
-  // (issue a smaller load then merge the bits in) but this seems unlikely to be
-  // valuable.
+  // If the loaded bytes aren't completely contained within the stored bytes,
+  // we don't have all the bits to feed it.  We could do something crazy in the
+  // future (issue a smaller load then merge the bits in) but this seems
+  // unlikely to be valuable.
   if (StoreOffset > LoadOffset ||
       StoreOffset + int64_t(StoreSize) < LoadOffset + int64_t(LoadSize))
     return -1;
@@ -286,7 +285,7 @@
 
   // This is the size of the load to try.  Start with the next larger power of
   // two.
-  unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
+  unsigned NewLoadByteSize = (LI->getType()->getPrimitiveSizeInBits() + 7) / 8U;
   NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
 
   while (true) {
@@ -414,29 +413,29 @@
     return SrcVal;
   }
 
-  uint64_t StoreSize =
-      (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
-  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
+  uint64_t StoreSize = DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize();
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
   // Compute which bits of the stored value are being used by the load.  Convert
   // to an integer type to start with.
   if (SrcVal->getType()->isPtrOrPtrVectorTy())
     SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
   if (!SrcVal->getType()->isIntegerTy())
-    SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+    SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize));
 
-  // Shift the bits to the least significant depending on endianness.
-  unsigned ShiftAmt;
+  // Shift the bytes to the least significant bits so that a truncate will
+  // work, depending on endianness.
+  unsigned ShiftBytes;
   if (DL.isLittleEndian())
-    ShiftAmt = Offset * 8;
+    ShiftBytes = Offset;
   else
-    ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
-  if (ShiftAmt)
-    SrcVal = Helper.CreateLShr(SrcVal,
-                               ConstantInt::get(SrcVal->getType(), ShiftAmt));
+    ShiftBytes = (StoreSize - LoadSize) / 8 - Offset;
+  if (ShiftBytes)
+    SrcVal = Helper.CreateLShr(
+        SrcVal, ConstantInt::get(SrcVal->getType(), ShiftBytes * 8));
 
   if (LoadSize != StoreSize)
-    SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
-                                         IntegerType::get(Ctx, LoadSize * 8));
+    SrcVal =
+        Helper.CreateTruncOrBitCast(SrcVal, IntegerType::get(Ctx, LoadSize));
   return SrcVal;
 }
 
@@ -526,7 +525,7 @@
                                 Type *LoadTy, HelperClass &Helper,
                                 const DataLayout &DL) {
   LLVMContext &Ctx = LoadTy->getContext();
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
+  uint64_t LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
 
   // We know that this method is only called when the mem transfer fully
   // provides the bits for the load.
diff --git a/llvm/test/Transforms/GVN/pr10820.ll b/llvm/test/Transforms/GVN/pr10820.ll
--- a/llvm/test/Transforms/GVN/pr10820.ll
+++ b/llvm/test/Transforms/GVN/pr10820.ll
@@ -1,8 +1,5 @@
-; RUN: opt < %s -basic-aa -gvn -S | FileCheck %s
-
-target datalayout =
-"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt -mtriple="x86_64-unknown-linux-gnu" --data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" < %s -basic-aa -gvn -S | FileCheck %s
+; RUN: opt -mtriple="powerpc64-unknown-linux-gnu" --data-layout="E-m:e-i64:64-n32:64" < %s -basic-aa -gvn -S | FileCheck %s
 
 @g = external global i31
 
@@ -16,3 +13,17 @@
   store i31 %0, i31* undef, align 1
   unreachable
 }
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+define i7 @memset_forward(i7 *%p) {
+; CHECK-LABEL: @memset_forward(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast i7* [[P:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 -120, i64 1, i1 false)
+; CHECK-NEXT:    ret i7 8
+;
+  %conv = bitcast i7* %p to i8*
+  call void @llvm.memset.p0i8.i64(i8* %conv, i8 -120, i64 1, i1 false)
+  %load = load i7, i7* %p
+  ret i7 %load
+}
diff --git a/llvm/test/Transforms/NewGVN/pr10820-xfail.ll b/llvm/test/Transforms/NewGVN/pr10820-xfail.ll
deleted file mode 100644
--- a/llvm/test/Transforms/NewGVN/pr10820-xfail.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; XFAIL: *
-; RUN: opt < %s -basic-aa -newgvn -S | FileCheck %s
-; NewGVN fails this due to missing load coercion
-target datalayout =
-"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-
-@g = external global i31
-
-define void @main() nounwind uwtable {
-entry:
-; CHECK: store i32
-  store i32 402662078, i32* bitcast (i31* @g to i32*), align 8
-; CHECK-NOT: load i31
-  %0 = load i31, i31* @g, align 8
-; CHECK: store i31
-  store i31 %0, i31* undef, align 1
-  unreachable
-}
diff --git a/llvm/test/Transforms/NewGVN/pr10820.ll b/llvm/test/Transforms/NewGVN/pr10820.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/pr10820.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple="x86_64-unknown-linux-gnu" --data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64" < %s -basic-aa -newgvn -S | FileCheck %s
+; RUN: opt -mtriple="powerpc64-unknown-linux-gnu" --data-layout="E-m:e-i64:64-n32:64" < %s -basic-aa -newgvn -S | FileCheck %s
+
+@g = external global i31
+
+define void @main() nounwind uwtable {
+entry:
+; CHECK: store i32
+  store i32 402662078, i32* bitcast (i31* @g to i32*), align 8
+; CHECK-NOT: load i31
+  %0 = load i31, i31* @g, align 8
+; CHECK: store i31
+  store i31 %0, i31* undef, align 1
+  unreachable
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+define i7 @memset_forward(i7 *%p) {
+; CHECK-LABEL: @memset_forward(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast i7* [[P:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 -120, i64 1, i1 false)
+; CHECK-NEXT:    ret i7 8
+;
+  %conv = bitcast i7* %p to i8*
+  call void @llvm.memset.p0i8.i64(i8* %conv, i8 -120, i64 1, i1 false)
+  %load = load i7, i7* %p
+  ret i7 %load
+}