diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -9,22 +9,18 @@
 namespace llvm {
 namespace VNCoercion {
 
-static bool isFirstClassAggregateOrScalableType(Type *Ty) {
-  return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
+static bool isBitCastable(Type *Ty) {
+  return !(Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty));
 }
 
-/// Return true if coerceAvailableValueToLoadType will succeed.
-bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
-  Type *StoredTy = StoredVal->getType();
-
+bool canCoerceMustAliasedTypeToLoad(Type *StoredTy, Type *LoadTy,
+                                    const DataLayout &DL) {
   if (StoredTy == LoadTy)
     return true;
 
   // If the loaded/stored value is a first class array/struct, or scalable type,
   // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  if (!isBitCastable(LoadTy) || !isBitCastable(StoredTy))
     return false;
 
   uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
@@ -40,29 +36,40 @@
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
   bool LoadNI = DL.isNonIntegralPointerType(LoadTy->getScalarType());
   // Don't coerce non-integral pointers to integers or vice versa.
-  if (StoredNI != LoadNI) {
-    // As a special case, allow coercion of memset used to initialize
-    // an array w/null. Despite non-integral pointers not generally having a
-    // specific bit pattern, we do assume null is zero.
-    if (auto *CI = dyn_cast<Constant>(StoredVal))
-      return CI->isNullValue();
+  if (StoredNI != LoadNI)
     return false;
-  } else if (StoredNI && LoadNI &&
-             StoredTy->getPointerAddressSpace() !=
-                 LoadTy->getPointerAddressSpace()) {
+  else if (StoredNI && LoadNI &&
+           StoredTy->getPointerAddressSpace() !=
+               LoadTy->getPointerAddressSpace())
     return false;
-  }
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
   if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedSize())
     return false;
 
   return true;
 }
 
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+                                     const DataLayout &DL) {
+  Type *StoredTy = StoredVal->getType();
+  // Unlike canCoerceMustAliasedTypeToLoad, we know the stored value here, not
+  // just its type. This lets us special-case an all-zero store: as long as the
+  // store is at least as large as the load, the caller needs no further
+  // size/offset checks to materialize zeros for the load.
+  if (auto *CI = dyn_cast<Constant>(StoredVal))
+    // All callers are expected to be able to detect and handle the zeros case.
+    if (CI->isNullValue())
+      if (isa<ScalableVectorType>(StoredTy) ==
+              isa<ScalableVectorType>(LoadTy) &&
+          DL.getTypeSizeInBits(StoredTy).getKnownMinSize() >=
+              DL.getTypeSizeInBits(LoadTy).getKnownMinSize())
+        return true;
+  return canCoerceMustAliasedTypeToLoad(StoredTy, LoadTy, DL);
+}
+
 template <class T, class HelperClass>
 static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
                                                HelperClass &Helper,
@@ -72,8 +79,11 @@
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
-  // If this is already the right type, just return it.
+  // If this is already the right type, just return it before querying the
+  // fixed size below, which is not valid for scalable vectors.
   Type *StoredValTy = StoredVal->getType();
+  if (StoredValTy == LoadedTy)
+    return StoredVal;
 
   uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
   uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
@@ -111,7 +121,7 @@
   // extract out a piece from it. If the available value is too small, then we
   // can't do anything.
   assert(StoredValSize >= LoadedValSize &&
-         "canCoerceMustAliasedValueToLoad fail");
+         "canCoerceMustAliasedTypeToLoad fail");
 
   // Convert source pointers to integers, which can be manipulated.
   if (StoredValTy->isPtrOrPtrVectorTy()) {
@@ -177,11 +187,6 @@
                                           Value *WritePtr,
                                           uint64_t WriteSizeInBits,
                                           const DataLayout &DL) {
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy))
-    return -1;
-
   int64_t StoreOffset = 0, LoadOffset = 0;
   Value *StoreBase =
       GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
@@ -215,8 +220,9 @@
                                    StoreInst *DepSI, const DataLayout &DL) {
   auto *StoredVal = DepSI->getValueOperand();
 
-  // Cannot handle reading from store of first-class aggregate or scalable type.
-  if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
+  // We need to be able to look at the size of the result.
+  if (isa<ScalableVectorType>(StoredVal->getType()) ||
+      isa<ScalableVectorType>(LoadTy))
     return -1;
 
   if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
@@ -224,7 +230,7 @@
 
   Value *StorePtr = DepSI->getPointerOperand();
   uint64_t StoreSize =
-      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
+      DL.getTypeSizeInBits(StoredVal->getType()).getFixedSize();
   return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
                                         DL);
 }
@@ -319,21 +325,26 @@
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
-    return -1;
+  Value *DepPtr = DepLI->getPointerOperand();
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  // We need to be able to look at the size of the result.
+  if (isa<ScalableVectorType>(DepLI->getType()) ||
+      isa<ScalableVectorType>(LoadTy))
     return -1;
 
-  Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
-  int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
-  if (R != -1)
-    return R;
+  if (canCoerceMustAliasedTypeToLoad(DepLI->getType(), LoadTy, DL)) {
+    uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
+    int R =
+        analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+    if (R != -1)
+      return R;
+  }
 
-  // If we have a load/load clobber an DepLI can be widened to cover this load,
+  // If we have a load/load clobber and DepLI can be widened to cover this load,
   // then we should widen it!
+  if (!canCoerceMustAliasedTypeToLoad(LoadTy, DepLI->getType(), DL))
+    return -1;
+
   int64_t LoadOffs = 0;
   const Value *LoadBase =
       GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
@@ -360,16 +371,25 @@
     return -1;
   uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
 
+  // We need to be able to look at the size of the result.
+  if (isa<ScalableVectorType>(LoadTy))
+    return -1;
+
   // If this is memset, we just need to see if the offset is valid in the size
   // of the memset..
   if (MI->getIntrinsicID() == Intrinsic::memset) {
-    if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
-      auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
-      if (!CI || !CI->isZero())
-        return -1;
-    }
-    return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
-                                          MemSizeInBits, DL);
+    Value *StoredVal = cast<MemSetInst>(MI)->getValue();
+    bool CanCast = ([&]() {
+      if (auto *CI = dyn_cast<ConstantInt>(StoredVal))
+        if (CI->isNullValue())
+          return true;
+      Type *StoreTy = IntegerType::get(LoadTy->getContext(), MemSizeInBits);
+      return canCoerceMustAliasedTypeToLoad(StoreTy, LoadTy, DL);
+    })();
+    if (CanCast)
+      return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                            MemSizeInBits, DL);
+    return -1;
   }
 
   // If we have a memcpy/memmove, the only case we can handle is if this is a
@@ -404,6 +424,9 @@
                                      HelperClass &Helper,
                                      const DataLayout &DL) {
   LLVMContext &Ctx = SrcVal->getType()->getContext();
+  if (auto *CI = dyn_cast<Constant>(getUnderlyingObject(SrcVal)))
+    if (CI->isNullValue())
+      return Constant::getNullValue(LoadTy);
 
   // If two pointers are in the same address space, they have the same size,
   // so we don't need to do any truncation, etc. This avoids introducing
@@ -534,6 +557,14 @@
     // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
     // independently of what the offset is.
     T *Val = cast<T>(MSI->getValue());
+    if (auto *CI = dyn_cast<ConstantInt>(Val)) {
+      // memset(P, '\0', 1234) -> directly materialize the null value of the
+      // load's type for the use of *P, bypassing the later bitcast-based
+      // splat handling used to build this constant.
+      if (CI->isNullValue())
+        return Constant::getNullValue(LoadTy);
+    }
+
     if (LoadSize != 1)
       Val =
           Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
diff --git a/llvm/test/Transforms/GVN/non-integral-pointers.ll b/llvm/test/Transforms/GVN/non-integral-pointers.ll
--- a/llvm/test/Transforms/GVN/non-integral-pointers.ll
+++ b/llvm/test/Transforms/GVN/non-integral-pointers.ll
@@ -108,6 +108,21 @@
   ret i8 addrspace(4)* %ref
 }
 
+define [2 x i8 addrspace(4)*] @forward_memset_zero_fca([2 x i8 addrspace(4)*] addrspace(4)* %loc) {
+; CHECK-LABEL: @forward_memset_zero_fca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LOC_BC:%.*]] = bitcast [2 x i8 addrspace(4)*] addrspace(4)* [[LOC:%.*]] to i8 addrspace(4)*
+; CHECK-NEXT:    call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 [[LOC_BC]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret [2 x i8 addrspace(4)*] zeroinitializer
+;
+entry:
+  %loc.bc = bitcast [2 x i8 addrspace(4)*] addrspace(4)* %loc to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* align 4 %loc.bc, i8 0, i64 16, i1 false)
+  %ref = load [2 x i8 addrspace(4)*], [2 x i8 addrspace(4)*] addrspace(4)* %loc
+  ret [2 x i8 addrspace(4)*] %ref
+}
+
 ; Can't forward as the load might be dead.  (Pretend we wrote out the alwaysfalse idiom above.)
 define i8 addrspace(4)* @neg_forward_store(i8 addrspace(4)* addrspace(4)* %loc) {
 ; CHECK-LABEL: @neg_forward_store(
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -139,8 +139,8 @@
 ; Analyze Load from clobbering Store.
 
-define <vscale x 4 x i32> @store_forward_to_load(<vscale x 4 x i32>* %p) {
-; CHECK-LABEL: @store_forward_to_load(
+define <vscale x 4 x i32> @nullstore_forward_to_load(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @nullstore_forward_to_load(
 ; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]], align 16
 ; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
 ;
@@ -149,6 +149,29 @@
   ret <vscale x 4 x i32> %load
 }
 
+define <vscale x 4 x float> @nullstore_forward_to_load_casted(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @nullstore_forward_to_load_casted(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> zeroinitializer
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %conv = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+  %load = load <vscale x 4 x float>, <vscale x 4 x float>* %conv
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i32> @nullstore_forward_to_load_trunc(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @nullstore_forward_to_load_trunc(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <vscale x 2 x i32> zeroinitializer
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %conv = bitcast <vscale x 4 x i32>* %p to <vscale x 2 x i32>*
+  %load = load <vscale x 2 x i32>, <vscale x 2 x i32>* %conv
+  ret <vscale x 2 x i32> %load
+}
+
 define <vscale x 4 x i32> @store_forward_to_load_sideeffect(<vscale x 4 x i32>* %p) {
 ; CHECK-LABEL: @store_forward_to_load_sideeffect(
 ; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]], align 16
@@ -177,6 +200,30 @@
   ret i32 %load
 }
 
+define <vscale x 4 x float> @store_forward_to_load_casted(<vscale x 4 x i32>* %p, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @store_forward_to_load_casted(
+; CHECK-NEXT:    store <vscale x 4 x i32> %v, <vscale x 4 x i32>* [[P:%.*]], align 16
+; CHECK-NEXT:    %conv = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+; CHECK-NEXT:    %load = load <vscale x 4 x float>, <vscale x 4 x float>* %conv
+; CHECK-NEXT:    ret <vscale x 4 x float> %load
+;
+  store <vscale x 4 x i32> %v, <vscale x 4 x i32>* %p
+  %conv = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+  %load = load <vscale x 4 x float>, <vscale x 4 x float>* %conv
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 4 x i32> @store_forward_to_load(<vscale x 4 x i32>* %p, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @store_forward_to_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> %v, <vscale x 4 x i32>* [[P:%.*]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x i32> %v
+;
+  store <vscale x 4 x i32> %v, <vscale x 4 x i32>* %p
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
 ; Analyze Load from clobbering MemInst.
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
@@ -194,6 +241,20 @@
   ret i32 %load
 }
 
+define <vscale x 4 x i32> @memset_clobber_load_vscale(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load_vscale(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 0, i64 128, i1 false)
+; CHECK-NEXT:    %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+; CHECK-NEXT:    ret <vscale x 4 x i32> %load
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  call void @llvm.memset.p0i8.i64(i8* %conv, i8 0, i64 128, i1 false)
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
 define i32 @memset_clobber_load_vscaled_base(<vscale x 4 x i32> *%p) {
 ; CHECK-LABEL: @memset_clobber_load_vscaled_base(
 ; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
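
Illustration only, not part of the patch: a minimal C++ sketch of how a caller might use the two entry points after this change. It assumes the new canCoerceMustAliasedTypeToLoad is declared in llvm/Transforms/Utils/VNCoercion.h next to the existing Value-based function (the header change is not shown in this diff); the helper names couldWidenLoad and canForwardStoreToLoad are hypothetical.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

using namespace llvm;

// Only the types are known here (e.g. asking whether DepLI could be widened to
// also cover LI), so use the purely type-based query, which rejects
// aggregates and scalable vectors outright.
static bool couldWidenLoad(LoadInst *DepLI, LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  return VNCoercion::canCoerceMustAliasedTypeToLoad(LI->getType(),
                                                    DepLI->getType(), DL);
}

// The stored value itself is known here, so the Value-based wrapper applies;
// besides the type-based check, it also accepts an all-zero store whose known
// minimum size covers the load, which is the scalable-vector case this patch
// enables (the materialization path then returns Constant::getNullValue).
static bool canForwardStoreToLoad(StoreInst *SI, LoadInst *LI) {
  const DataLayout &DL = SI->getModule()->getDataLayout();
  return VNCoercion::canCoerceMustAliasedValueToLoad(SI->getValueOperand(),
                                                     LI->getType(), DL);
}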