diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -539,6 +539,12 @@
   bool GepHasConstantOffset = true;
   for (User::const_op_iterator I = GEPOp->op_begin() + 1, E = GEPOp->op_end();
        I != E; ++I, ++GTI) {
+    Type *GEPIdxedTy = GTI.getIndexedType();
+    if (GEPIdxedTy->isVectorTy() && GEPIdxedTy->getVectorIsScalable()) {
+      GepHasConstantOffset = false;
+      break;
+    }
+
     const Value *Index = *I;
     // Compute the (potentially symbolic) offset in bytes for this index.
     if (StructType *STy = GTI.getStructTypeOrNull()) {
@@ -557,15 +563,16 @@
       if (CIdx->isZero())
         continue;
       Decomposed.OtherOffset +=
-          (DL.getTypeAllocSize(GTI.getIndexedType()) *
-           CIdx->getValue().sextOrSelf(MaxPointerSize))
-              .sextOrTrunc(MaxPointerSize);
+          (DL.getTypeAllocSize(GEPIdxedTy).getFixedSize() *
+           CIdx->getValue().sextOrSelf(MaxPointerSize))
+              .sextOrTrunc(MaxPointerSize);
       continue;
     }
 
     GepHasConstantOffset = false;
 
-    APInt Scale(MaxPointerSize, DL.getTypeAllocSize(GTI.getIndexedType()));
+    APInt Scale(MaxPointerSize,
+                DL.getTypeAllocSize(GEPIdxedTy).getFixedSize());
     unsigned ZExtBits = 0, SExtBits = 0;
 
     // If the integer type is smaller than the pointer size, it is implicitly
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1877,7 +1877,7 @@
       // If the element type has zero size then any index over it is equivalent
       // to an index of zero, so replace it with zero if it is not zero already.
       Type *EltTy = GTI.getIndexedType();
-      if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0)
+      if (EltTy->isSized() && !DL.getTypeAllocSize(EltTy).isNonZero())
         if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
           *I = Constant::getNullValue(NewIndexType);
           MadeChange = true;
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -20,17 +20,19 @@
   // If the loaded or stored value is an first class array or struct, don't try
   // to transform them. We need to be able to bitcast to integer.
   if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() ||
-      StoredTy->isArrayTy())
+      StoredTy->isArrayTy() ||
+      (LoadTy->isVectorTy() && LoadTy->getVectorIsScalable()) ||
+      (StoredTy->isVectorTy() && StoredTy->getVectorIsScalable()))
     return false;
 
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy);
+  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
 
   // The store size must be byte-aligned to support future type casts.
   if (llvm::alignTo(StoreSize, 8) != StoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy))
+  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
     return false;
 
   // Don't coerce non-integral pointers to integers or vice versa.
@@ -59,8 +61,8 @@
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
 
   // If the store and reload are the same size, we can always reuse it.
   if (StoredValSize == LoadedValSize) {
@@ -112,8 +114,8 @@
   // If this is a big-endian system, we need to shift the value down to the low
   // bits so that a truncate will work.
   if (DL.isBigEndian()) {
-    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
-                        DL.getTypeStoreSizeInBits(LoadedTy);
+    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+                        DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
     StoredVal = Helper.CreateLShr(
         StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
   }
@@ -162,7 +164,8 @@
                                           const DataLayout &DL) {
   // If the loaded or stored value is a first class array or struct, don't try
   // to transform them. We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+      (LoadTy->isVectorTy() && LoadTy->getVectorIsScalable()))
     return -1;
 
   int64_t StoreOffset = 0, LoadOffset = 0;
@@ -180,7 +183,7 @@
   // If the load and store don't overlap at all, the store doesn't provide
   // anything to the load. In this case, they really don't alias at all, AA
   // must have gotten confused.
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
 
   if ((WriteSizeInBits & 7) | (LoadSize & 7))
     return -1;
@@ -216,8 +219,9 @@
   auto *StoredVal = DepSI->getValueOperand();
 
   // Cannot handle reading from store of first-class aggregate yet.
-  if (StoredVal->getType()->isStructTy() ||
-      StoredVal->getType()->isArrayTy())
+  if (StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy() ||
+      (StoredVal->getType()->isVectorTy() &&
+       StoredVal->getType()->getVectorIsScalable()))
     return -1;
 
   // Don't coerce non-integral pointers to integers or vice versa.
@@ -231,7 +235,7 @@
 
   Value *StorePtr = DepSI->getPointerOperand();
   uint64_t StoreSize =
-      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
   return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
                                         DL);
 }
@@ -336,7 +340,7 @@
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
   int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
   if (R != -1)
     return R;
@@ -346,7 +350,7 @@
   int64_t LoadOffs = 0;
   const Value *LoadBase =
       GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
 
   unsigned Size =
       getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
@@ -436,8 +440,9 @@
     return SrcVal;
   }
 
-  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
-  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+  uint64_t StoreSize =
+      (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
   // Compute which bits of the stored value are being used by the load. Convert
   // to an integer type to start with.
   if (SrcVal->getType()->isPtrOrPtrVectorTy())
@@ -489,8 +494,9 @@
                            Instruction *InsertPt, const DataLayout &DL) {
   // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
   // widen SrcVal out to a larger load.
-  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned SrcValStoreSize =
+      DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
   if (Offset + LoadSize > SrcValStoreSize) {
     assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
     assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
@@ -533,8 +539,9 @@
 
 Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
                                       Type *LoadTy, const DataLayout &DL) {
-  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  unsigned SrcValStoreSize =
+      DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
   if (Offset + LoadSize > SrcValStoreSize)
     return nullptr;
   return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
@@ -545,7 +552,7 @@
                                 Type *LoadTy, HelperClass &Helper,
                                 const DataLayout &DL) {
   LLVMContext &Ctx = LoadTy->getContext();
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
 
   // We know that this method is only called when the mem transfer fully
   // provides the bits for the load.
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -0,0 +1,337 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -basicaa -gvn -dce | FileCheck %s
+
+; Analyze Load from clobbering Load.
+
+define <vscale x 4 x i32> @load_store_clobber_load(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @load_store_clobber_load(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* undef
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_mayalias(<vscale x 4 x i32>* %p, <vscale x 4 x i32>* %p2) {
+; CHECK-LABEL: @load_store_clobber_load_mayalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[SUB]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %sub = sub <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_store_clobber_load_noalias(<vscale x 4 x i32>* noalias %p, <vscale x 4 x i32>* noalias %p2) {
+; CHECK-LABEL: @load_store_clobber_load_noalias(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P2:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p2
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; <- load to be eliminated
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+; TODO: %load2 could be eliminated
+define i32 @load_clobber_load_gep1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep1(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 1
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 1
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+  %gep2 = getelementptr i32, i32* %p2, i64 1
+  %load2 = load i32, i32* %gep2 ; <- load could be eliminated
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+define i32 @load_clobber_load_gep2(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep2(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[P2:%.*]] = bitcast <vscale x 4 x i32>* [[P]] to i32*
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, i32* [[P2]], i64 4
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, i32* [[GEP2]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to i32*
+  %gep2 = getelementptr i32, i32* %p2, i64 4
+  %load2 = load i32, i32* %gep2 ; <- can not determine at compile-time if %load1 and %load2 are same addr
+  %add = add i32 %load1, %load2
+  ret i32 %add
+}
+
+define i32 @load_clobber_load_gep3(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_gep3(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 0
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[LOAD1]], [[LOAD1]]
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  %load1 = load i32, i32* %gep1
+  %p2 = bitcast <vscale x 4 x i32>* %p to <vscale x 4 x float>*
+  %gep2 = getelementptr <vscale x 4 x float>, <vscale x 4 x float>* %p2, i64 1, i64 0
+  %load2 = load float, float* %gep2 ; <- load to be eliminated
+  %cast = bitcast float %load2 to i32
+  %add = add i32 %load1, %cast
+  ret i32 %add
+}
+
+define <vscale x 4 x i32> @load_clobber_load_fence(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_fence(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm "", "~{memory}"()
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[SUB]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  call void asm "", "~{memory}"()
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %sub = sub <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %sub
+}
+
+define <vscale x 4 x i32> @load_clobber_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @load_clobber_load_sideeffect(
+; CHECK-NEXT:    [[LOAD1:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm sideeffect "", ""()
+; CHECK-NEXT:    [[LOAD2:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
+;
+  %load1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  call void asm sideeffect "", ""()
+  %load2 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  %add = add <vscale x 4 x i32> %load1, %load2
+  ret <vscale x 4 x i32> %add
+}
+
+; Analyze Load from clobbering Store.
+
+define <vscale x 4 x i32> @store_forward_to_load(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> zeroinitializer
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x i32> @store_forward_to_load_sideeffect(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @store_forward_to_load_sideeffect(
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    call void asm sideeffect "", ""()
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[LOAD]]
+;
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  call void asm sideeffect "", ""()
+  %load = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p
+  ret <vscale x 4 x i32> %load
+}
+
+define i32 @store_clobber_load() {
+; CHECK-LABEL: @store_clobber_load(
+; CHECK-NEXT:    [[ALLOC:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT:    store <vscale x 4 x i32> undef, <vscale x 4 x i32>* [[ALLOC]]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[ALLOC]], i32 0, i32 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[PTR]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %alloc = alloca <vscale x 4 x i32>
+  store <vscale x 4 x i32> undef, <vscale x 4 x i32>* %alloc
+  %ptr = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %alloc, i32 0, i32 1
+  %load = load i32, i32* %ptr
+  ret i32 %load
+}
+
+; Analyze Load from clobbering MemInst.
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+
+define i32 @memset_clobber_load(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    ret i32 16843009
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 0, i64 5
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+define i32 @memset_clobber_load_vscaled_base(<vscale x 4 x i32> *%p) {
+; CHECK-LABEL: @memset_clobber_load_vscaled_base(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 1
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+define i32 @memset_clobber_load_nonconst_index(<vscale x 4 x i32> *%p, i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: @memset_clobber_load_nonconst_index(
+; CHECK-NEXT:    [[CONV:%.*]] = bitcast <vscale x 4 x i32>* [[P:%.*]] to i8*
+; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[CONV]], i8 1, i64 200, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 [[IDX1:%.*]], i64 [[IDX2:%.*]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+  %conv = bitcast <vscale x 4 x i32>* %p to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i1 false)
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 %idx1, i64 %idx2
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+
+; Load elimination across BBs
+
+define <vscale x 4 x i32>* @load_from_alloc_replaced_with_undef() {
+; CHECK-LABEL: @load_from_alloc_replaced_with_undef(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca <vscale x 4 x i32>
+; CHECK-NEXT:    br i1 undef, label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[A]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret <vscale x 4 x i32>* [[A]]
+;
+entry:
+  %a = alloca <vscale x 4 x i32>
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %a, i64 0, i64 1
+  %load = load i32, i32* %gep ; <- load to be eliminated
+  %tobool = icmp eq i32 %load, 0 ; <- icmp to be eliminated
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %a
+  br label %if.end
+
+if.end:
+  ret <vscale x 4 x i32>* %a
+}
+
+define i32 @redundant_load_elimination_1(<vscale x 4 x i32>* %p) {
+; CHECK-LABEL: @redundant_load_elimination_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, i32* [[GEP]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[LOAD1]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret i32 [[LOAD1]]
+;
+entry:
+  %gep = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  %load1 = load i32, i32* %gep
+  %cmp = icmp eq i32 %load1, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %load2 = load i32, i32* %gep ; <- load to be eliminated
+  %add = add i32 %load1, %load2
+  br label %if.end
+
+if.end:
+  %result = phi i32 [ %add, %if.then ], [ %load1, %entry ]
+  ret i32 %result
+}
+
+define void @redundant_load_elimination_2(i1 %c, <vscale x 4 x i32>* %p, i32* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @redundant_load_elimination_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P:%.*]], i64 1, i64 1
+; CHECK-NEXT:    store i32 0, i32* [[GEP1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1, i64 0
+; CHECK-NEXT:    store i32 1, i32* [[GEP2]]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 0, i32* [[Q:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       if.else:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %gep1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 1
+  store i32 0, i32* %gep1
+  %gep2 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1, i64 0
+  store i32 1, i32* %gep2
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+  %t = load i32, i32* %gep1 ; <- load to be eliminated
+  store i32 %t, i32* %q
+  ret void
+
+if.else:
+  ret void
+}
+
+; TODO: load in if.then could have been eliminated
+define void @missing_load_elimination(i1 %c, <vscale x 4 x i32>* %p, <vscale x 4 x i32>* %q, <vscale x 4 x i32> %v) {
+; CHECK-LABEL: @missing_load_elimination(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* [[P:%.*]]
+; CHECK-NEXT:    [[P1:%.*]] = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[V:%.*]], <vscale x 4 x i32>* [[P1]]
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[T:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[P]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[T]], <vscale x 4 x i32>* [[Q:%.*]]
+; CHECK-NEXT:    ret void
+; CHECK:       if.else:
+; CHECK-NEXT:    ret void
+;
+entry:
+  store <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32>* %p
+  %p1 = getelementptr <vscale x 4 x i32>, <vscale x 4 x i32>* %p, i64 1
+  store <vscale x 4 x i32> %v, <vscale x 4 x i32>* %p1
+  br i1 %c, label %if.else, label %if.then
+
+if.then:
+  %t = load <vscale x 4 x i32>, <vscale x 4 x i32>* %p ; load could be eliminated
+  store <vscale x 4 x i32> %t, <vscale x 4 x i32>* %q
+  ret void
+
+if.else:
+  ret void
+}