diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -28,7 +28,10 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/InstSimplifyFolder.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -38,6 +41,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" #define DEBUG_TYPE "amdgpu-promote-alloca" @@ -45,20 +49,20 @@ namespace { -static cl::opt<bool> DisablePromoteAllocaToVector( - "disable-promote-alloca-to-vector", - cl::desc("Disable promote alloca to vector"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToVector("disable-promote-alloca-to-vector", + cl::desc("Disable promote alloca to vector"), + cl::init(false)); -static cl::opt<bool> DisablePromoteAllocaToLDS( - "disable-promote-alloca-to-lds", - cl::desc("Disable promote alloca to LDS"), - cl::init(false)); +static cl::opt<bool> + DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds", + cl::desc("Disable promote alloca to LDS"), + cl::init(false)); static cl::opt<unsigned> PromoteAllocaToVectorLimit( - "amdgpu-promote-alloca-to-vector-limit", - cl::desc("Maximum byte size to consider promote alloca to vector"), - cl::init(0)); + "amdgpu-promote-alloca-to-vector-limit", + cl::desc("Maximum byte size to consider promote alloca to vector"), + cl::init(0)); // Shared implementation which can do both promotion to vector and to LDS. class AMDGPUPromoteAllocaImpl { @@ -80,17 +84,16 @@ /// BaseAlloca is the alloca root the search started from. /// Val may be that alloca or a recursive user of it. - bool collectUsesWithPtrTypes(Value *BaseAlloca, - Value *Val, - std::vector<Value *> &WorkList) const; + bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val, + std::vector<Value *> &WorkList) const; /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). /// Returns true if both operands are derived from the same alloca. Val should /// be the same value as one of the input operands of UseInst. bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, - Instruction *UseInst, - int OpIdx0, int OpIdx1) const; + Instruction *UseInst, int OpIdx0, + int OpIdx1) const; /// Check whether we have enough local memory for promotion. bool hasSufficientLocalMem(const Function &F); @@ -253,6 +256,10 @@ Changed = true; } + // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains + // dangling pointers. If we want to reuse it past this point, the loop above + // would need to be updated to remove successfully promoted allocas. + return Changed; } @@ -269,6 +276,10 @@ using namespace PatternMatch; // For now we only care about non-volatile memsets that affect the whole type // (start at index 0 and fill the whole alloca). + // + // TODO: Now that we moved to PromoteAlloca we could handle any memsets + // (except maybe volatile ones?) - we just need to use shufflevector if it + // only affects a subset of the vector.
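+  // A possible shape for that, as an illustrative sketch only (NumSet is an
+  // assumed name for the number of elements the memset would cover, starting
+  // at DestBegin; Elt is the fill value already converted to the element type,
+  // as in the existing memset handling in promoteAllocaUserToVector):
+  //   Value *Splat = Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+  //   SmallVector<int> Mask;
+  //   for (unsigned I = 0; I < VectorTy->getNumElements(); ++I)
+  //     Mask.push_back(I >= DestBegin && I < DestBegin + NumSet
+  //                        ? I + VectorTy->getNumElements() // lane from Splat
+  //                        : I);                            // keep current lane
+  //   Value *NewVal = Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Splat, Mask);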
const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType()); return I->getOperand(0) == AI && match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile(); @@ -319,6 +330,200 @@ return ConstantInt::get(GEP->getContext(), Quot); } +/// Promotes a single user of the alloca to a vector form. +/// +/// \param Inst Instruction to be promoted. +/// \param DL Module Data Layout. +/// \param VectorTy Vectorized Type. +/// \param VecStoreSize Size of \p VectorTy in bytes. +/// \param ElementSize Size of \p VectorTy element type in bytes. +/// \param TransferInfo MemTransferInst info map. +/// \param GEPVectorIdx GEP -> VectorIdx cache. +/// \param CurVal Current value of the vector (e.g. last stored value) +/// \param[out] DeferredLoads \p Inst is added to this vector if it can't +/// be promoted now. This happens when promoting requires \p +/// CurVal, but \p CurVal is nullptr. +/// \return the stored value if \p Inst would have written to the alloca, or +/// nullptr otherwise. +static Value *promoteAllocaUserToVector( + Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, + unsigned VecStoreSize, unsigned ElementSize, + DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo, + std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal, + SmallVectorImpl<LoadInst *> &DeferredLoads) { + // Note: we use InstSimplifyFolder because it can leverage the DataLayout + // to do more folding, especially in the case of vector splats. + IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(), + InstSimplifyFolder(DL)); + Builder.SetInsertPoint(Inst); + + const auto GetOrLoadCurrentVectorValue = [&]() -> Value * { + if (CurVal) + return CurVal; + + // If the current value is not known, insert a dummy load and lower it on + // the second pass. + LoadInst *Dummy = + Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()), + "promotealloca.dummyload"); + DeferredLoads.push_back(Dummy); + return Dummy; + }; + + const auto CreateTempPtrIntCast = + [&Builder, VecStoreSize](Value *Val, Type *PtrTy) -> Value * { + const unsigned TempIntSize = (VecStoreSize * 8); + if (!PtrTy->isVectorTy()) + return Builder.CreateBitOrPointerCast(Val, + Builder.getIntNTy(TempIntSize)); + const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements(); + // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to + // first cast the ptr vector to <2 x i64>. + assert(alignTo(TempIntSize, NumPtrElts) == TempIntSize && + "Vector size not divisible"); + Type *EltTy = Builder.getIntNTy(TempIntSize / NumPtrElts); + return Builder.CreateBitOrPointerCast( + Val, FixedVectorType::get(EltTy, NumPtrElts)); + }; + + Type *VecEltTy = VectorTy->getElementType(); + switch (Inst->getOpcode()) { + case Instruction::Load: { + // Loads can only be lowered if the value is known. + if (!CurVal) { + DeferredLoads.push_back(cast<LoadInst>(Inst)); + return nullptr; + } + + Value *Index = calculateVectorIndex( + cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx); + + // We're loading the full vector. + if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) { + assert(cast<Constant>(Index)->isZeroValue()); + Type *InstTy = Inst->getType(); + if (InstTy->isPtrOrPtrVectorTy()) + CurVal = CreateTempPtrIntCast(CurVal, InstTy); + Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, InstTy); + Inst->replaceAllUsesWith(NewVal); + return nullptr; + } + + // We're loading one element.
+ Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index); + if (Inst->getType() != VecEltTy) + ExtractElement = + Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); + + Inst->replaceAllUsesWith(ExtractElement); + return nullptr; + } + case Instruction::Store: { + // For stores, it's a bit trickier and it depends on whether we're storing + // the full vector or not. If we're storing the full vector, we don't need + // to know the current value. If this is a store of a single element, we + // need to know the value. + StoreInst *SI = cast<StoreInst>(Inst); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Val = SI->getValueOperand(); + + // We're storing the full vector; we can handle this without knowing CurVal. + if (DL.getTypeStoreSize(Val->getType()) == VecStoreSize) { + assert(cast<Constant>(Index)->isZeroValue()); + Type *SrcTy = Val->getType(); + if (SrcTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, SrcTy); + return Builder.CreateBitOrPointerCast(Val, VectorTy); + } + + if (Val->getType() != VecEltTy) + Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); + return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, + Index); + } + case Instruction::Call: { + if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) { + // For memcpy, we need to know CurVal. + ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); + unsigned NumCopied = Length->getZExtValue() / ElementSize; + MemTransferInfo *TI = &TransferInfo[MTI]; + unsigned SrcBegin = TI->SrcIndex->getZExtValue(); + unsigned DestBegin = TI->DestIndex->getZExtValue(); + + SmallVector<int> Mask; + for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { + Mask.push_back(SrcBegin++); + } else { + Mask.push_back(Idx); + } + } + + return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask); + } + + if (auto *MSI = dyn_cast<MemSetInst>(Inst)) { + // For memset, we don't need to know the previous value because we + // currently only allow memsets that cover the whole alloca. + Value *Elt = MSI->getOperand(1); + if (DL.getTypeStoreSize(VecEltTy) > 1) { + Value *EltBytes = + Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt); + Elt = Builder.CreateBitCast(EltBytes, VecEltTy); + } + + return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + } + + llvm_unreachable("Unsupported call when promoting alloca to vector"); + } + + default: + llvm_unreachable("Inconsistency in instructions promotable to vector"); + } + + llvm_unreachable("Did not return after promoting instruction!"); +} + +/// Iterates over an instruction worklist that may contain multiple instructions +/// from the same basic block, but possibly not in program order. +template <typename InstContainer> +static void forEachWorkListItem(const InstContainer &WorkList, + std::function<void(Instruction *)> Fn) { + // Bucket up uses of the alloca by the block they occur in. + // This is important because we have to handle multiple defs/uses in a block + // ourselves: SSAUpdater is purely for cross-block references. + DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock; + for (Instruction *User : WorkList) + UsesByBlock[User->getParent()].insert(User); + + for (Instruction *User : WorkList) { + BasicBlock *BB = User->getParent(); + auto &BlockUses = UsesByBlock[BB]; + + // Already processed, skip. + if (BlockUses.empty()) + continue; + + // Only user in the block, directly process it. + if (BlockUses.size() == 1) { + Fn(User); + continue; + } + + // Multiple users in the block, do a linear scan to see users in order.
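+ // For example, the worklist may list a load ahead of a store that actually precedes it in the block; handling them in worklist order would feed the load a stale vector value, so we walk the block to respect program order.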
+ for (Instruction &Inst : *BB) { + if (!BlockUses.contains(&Inst)) + continue; + + Fn(&Inst); + } + + // Clear the block so we know it's been processed. + BlockUses.clear(); + } +} + // FIXME: Should try to pick the most likely to be profitable allocas first. bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); @@ -365,6 +570,7 @@ std::map<GetElementPtrInst *, Value *> GEPVectorIdx; SmallVector<Instruction *> WorkList; + SmallVector<Instruction *> UsersToRemove; SmallVector<Instruction *> DeferredInsts; SmallVector<Use *> Uses; DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo; @@ -393,12 +599,18 @@ return RejectUser(Inst, "pointer is being stored"); Type *AccessTy = getLoadStoreType(Inst); + if (AccessTy->isAggregateType()) + return RejectUser(Inst, "unsupported load/store as aggregate"); + assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy()); + Ptr = Ptr->stripPointerCasts(); - // Alloca already accessed as vector, leave alone. + // Alloca already accessed as vector. if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) + DL->getTypeStoreSize(AccessTy)) { + WorkList.push_back(Inst); continue; + } // Check that this is a simple access of a vector element. bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple() @@ -416,6 +628,7 @@ // Look through bitcasts. for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); continue; } @@ -429,6 +642,7 @@ GEPVectorIdx[GEP] = Index; for (Use &U : Inst->uses()) Uses.push_back(&U); + UsersToRemove.push_back(Inst); continue; } @@ -481,13 +695,17 @@ } // Ignore assume-like intrinsics and comparisons used in assumes. - if (isAssumeLikeIntrinsic(Inst)) + if (isAssumeLikeIntrinsic(Inst)) { + UsersToRemove.push_back(Inst); continue; + } if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast<Instruction>(U)); - })) + })) { + UsersToRemove.push_back(Inst); continue; + } return RejectUser(Inst, "unhandled alloca user"); } @@ -505,75 +723,60 @@ LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " << *VectorTy << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); + + // Alloca is uninitialized memory. Imitate that by making the first value + // undef. + SSAUpdater Updater; + Updater.Initialize(VectorTy, "promotealloca"); + Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy)); + + // First handle the initial worklist. + SmallVector<LoadInst *> DeferredLoads; + forEachWorkListItem(WorkList, [&](Instruction *I) { + BasicBlock *BB = I->getParent(); + // On the first pass, we only take values that are trivially known, i.e. + // where AddAvailableValue was already called in this block. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.FindValueForBlock(BB), DeferredLoads); + if (Result) + Updater.AddAvailableValue(BB, Result); + }); + + // Then handle deferred loads. + forEachWorkListItem(DeferredLoads, [&](Instruction *I) { + SmallVector<LoadInst *> NewDLs; + BasicBlock *BB = I->getParent(); + // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always + // get a value, inserting PHIs as needed. + Value *Result = promoteAllocaUserToVector( + I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx, + Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs); + if (Result) + Updater.AddAvailableValue(BB, Result); + assert(NewDLs.empty() && "No more deferred loads should be queued!"); + }); + + // Delete all instructions.
On the first pass, new dummy loads may have been + added so we need to collect them too. + DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end()); + InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end()); + for (Instruction *I : InstsToDelete) { + assert(I->use_empty()); + I->eraseFromParent(); + } - for (Instruction *Inst : WorkList) { - IRBuilder<> Builder(Inst); - switch (Inst->getOpcode()) { - case Instruction::Load: { - Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand(); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); - if (Inst->getType() != VecEltTy) - ExtractElement = - Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); - Inst->replaceAllUsesWith(ExtractElement); - Inst->eraseFromParent(); - break; - } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(Inst); - Value *Ptr = SI->getPointerOperand(); - Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *Elt = SI->getValueOperand(); - if (Elt->getType() != VecEltTy) - Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); - Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign()); - Inst->eraseFromParent(); - break; - } - case Instruction::Call: { - if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) { - ConstantInt *Length = cast<ConstantInt>(MTI->getLength()); - unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)]; - unsigned SrcBegin = TI->SrcIndex->getZExtValue(); - unsigned DestBegin = TI->DestIndex->getZExtValue(); - - SmallVector<int> Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { - if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin++); - } else { - Mask.push_back(Idx); - } - } - Value *VecValue = - Builder.CreateAlignedLoad(VectorTy, &Alloca, Alloca.getAlign()); - Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask); - Builder.CreateAlignedStore(NewVecValue, &Alloca, Alloca.getAlign()); - - Inst->eraseFromParent(); - } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - // Ensure the length parameter of the memsets matches the new vector - // type's. In general, the type size shouldn't change so this is a - // no-op, but it's better to be safe. - MSI->setOperand(2, Builder.getInt64(DL->getTypeStoreSize(VectorTy))); - } else { - llvm_unreachable("Unsupported call when promoting alloca to vector"); - } - break; - } - - default: - llvm_unreachable("Inconsistency in instructions promotable to vector"); - } + // Delete all the users that are known to be removable. + for (Instruction *I : reverse(UsersToRemove)) { + I->dropDroppableUses(); + assert(I->use_empty()); + I->eraseFromParent(); } + // Alloca should now be dead too.
+ assert(Alloca.use_empty()); + Alloca.eraseFromParent(); return true; } @@ -1061,7 +1264,7 @@ CurrentLocalMemUsage = NewSize; - std::vector WorkList; + std::vector WorkList; if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); @@ -1204,10 +1407,9 @@ assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove); MemTransferInst *MI = cast(Intr); - auto *B = - Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(), - MI->getRawSource(), MI->getSourceAlign(), - MI->getLength(), MI->isVolatile()); + auto *B = Builder.CreateMemTransferInst( + ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(), + MI->getSourceAlign(), MI->getLength(), MI->isVolatile()); for (unsigned I = 0; I != 2; ++I) { if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) { diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -13,37 +13,20 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s16, s33 -; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 -; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x3000 +; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v42, s16, 2 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s30, 0 -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:92 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:88 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:84 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:80 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:76 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:72 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:68 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:64 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v42, s31, 1 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND @@ -57,9 +40,9 @@ ; GCN-NEXT: v_readlane_b32 s30, v42, 0 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xd000 +; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s ; Make sure that array alloca loaded and stored as multi-element aggregates are handled correctly ; Strictly the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but @@ -16,19 +16,16 @@ define amdgpu_vs void @promote_1d_aggr() #0 { ; CHECK-LABEL: @promote_1d_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4 -; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0 +; CHECK-NEXT: [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]] ; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4 -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 -; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0 +; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1 ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2 ; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3 @@ -60,22 +57,12 @@ define amdgpu_vs void @promote_store_aggr() #0 { ; CHECK-LABEL: @promote_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0 -; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 
0, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1 -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float +; CHECK-NEXT: [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0 +; CHECK-NEXT: [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1 ; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1 -; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4 +; CHECK-NEXT: store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4 ; CHECK-NEXT: store <4 x float> , ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; @@ -100,23 +87,18 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { ; CHECK-LABEL: @promote_load_from_store_aggr( -; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) -; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1 ; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4 -; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]] -; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 -; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0 -; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2 -; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3 +; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> undef, float [[FOO3_FCA_0_EXTRACT]], i32 0 +; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]] +; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 +; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2 +; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3 ; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; @@ -142,22 +124,7 @@ define amdgpu_vs void @promote_memmove_aggr() #0 { ; CHECK-LABEL: @promote_memmove_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: 
store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -174,24 +141,12 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 3.000000e+00, i32 [[FOO4]] -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> , float 3.000000e+00, i32 [[FOO4]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0 +; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -213,22 +168,7 @@ define amdgpu_vs void @promote_memcpy_identity_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_identity_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, 
addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO1:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 1.000000e+00, i64 1 -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 2.000000e+00, i64 3 -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x float> [[TMP5]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP6]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <5 x float> [[TMP7]], i32 0 -; CHECK-NEXT: store float [[TMP8]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -248,8 +188,26 @@ ; CHECK-LABEL: @promote_memcpy_two_aggrs( ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) ; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F2]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4 +; CHECK-NEXT: [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4 +; CHECK-NEXT: 
[[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] @@ -283,7 +241,16 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 { ; CHECK-LABEL: @promote_memcpy_p1p5_aggr( ; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4 +; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4 +; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 ; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] @@ -305,21 +272,12 @@ define amdgpu_vs void @promote_memcpy_inline_aggr() #0 { ; CHECK-LABEL: @promote_memcpy_inline_aggr( -; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5) -; CHECK-NEXT: store [5 x float] zeroinitializer, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3 ; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0 ; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 3.000000e+00, i32 [[FOO4]] -; CHECK-NEXT: store <5 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <5 x float> [[TMP3]], <5 x float> poison, <5 x i32> -; CHECK-NEXT: store <5 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load <5 x float>, ptr addrspace(5) [[F1]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <5 x float> [[TMP5]], i32 0 -; CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) @pv, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> zeroinitializer, float 3.000000e+00, i32 [[FOO4]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 
x float> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0 +; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4 ; CHECK-NEXT: ret void ; %f1 = alloca [5 x float], addrspace(5) @@ -347,30 +305,16 @@ define amdgpu_ps void @promote_double_aggr() #0 { ; CHECK-LABEL: @promote_double_aggr( -; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5) ; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 ; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8 ; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 ; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8 ; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1 -; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 -; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 -; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0 -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1 -; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0 +; CHECK-NEXT: [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1 +; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]] +; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]] ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0 ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1 @@ -410,21 +354,6 @@ define amdgpu_kernel void @alloca_struct() #0 { ; CHECK-LABEL: @alloca_struct( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range 
[[RNG1:![0-9]+]], !invariant.load !0 -; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]] -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -6,7 +6,7 @@ @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) { -; IR: alloca [10 x i32] +; IR-NOT: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: ; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16 ; ASM-NOT: .amdgpu_lds diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll @@ -0,0 +1,161 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @test_overwrite +; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 68, i32 0 +; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[TMP1]], i64 32, i32 0 +; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 68 +; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2]], [[LOOP]] ], [ , [[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [3 x i64], align 4, addrspace(5) + store i64 43, ptr addrspace(5) %stack + br i1 %cond, label %loop, label %end + +loop: + %load.0 = load i64, ptr addrspace(5) %stack + store i64 68, ptr addrspace(5) %stack + %load.1 = load i64, ptr addrspace(5) %stack + store i64 32, ptr addrspace(5) %stack + %loop.cc = icmp ne i64 %load.0, %load.1 + br i1 %loop.cc, label %loop, label %end + +end: + %reload = load i64, ptr addrspace(5) %stack + ret void 
+} + +define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite +; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PROMOTEALLOCA:%.*]] = phi <3 x i64> [ [[TMP1:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA]], i32 0 +; CHECK-NEXT: [[TMP1]] = insertelement <3 x i64> [[PROMOTEALLOCA]], i64 32, i32 1 +; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP0]], 32 +; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP1]], [[LOOP]] ], [ , [[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 1 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [3 x i64], align 4, addrspace(5) + %stack.1 = getelementptr inbounds i64, ptr addrspace(5) %stack, i32 1 + store i64 43, ptr addrspace(5) %stack + br i1 %cond, label %loop, label %end + +loop: + %load = load i64, ptr addrspace(5) %stack + store i64 32, ptr addrspace(5) %stack.1 + %loop.cc = icmp ne i64 %load, 32 + br i1 %loop.cc, label %loop, label %end + +end: + %reload = load i64, ptr addrspace(5) %stack + %reload.1 = load i64, ptr addrspace(5) %stack.1 + ret void +} + +define ptr @alloca_load_store_ptr64_full_ivec(ptr %arg) { +; CHECK-LABEL: define ptr @alloca_load_store_ptr64_full_ivec +; CHECK-SAME: (ptr [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NEXT: ret ptr [[TMP3]] +; +entry: + %alloca = alloca [8 x i8], align 8, addrspace(5) + store ptr %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr, ptr addrspace(5) %alloca, align 8 + ret ptr %tmp +} + +define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec(ptr addrspace(3) %arg) { +; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr32_full_ivec +; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[ARG]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[TMP0]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i8> [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: ret ptr addrspace(3) [[TMP3]] +; +entry: + %alloca = alloca [4 x i8], align 8, addrspace(5) + store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 + ret ptr addrspace(3) %tmp +} + +define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec(<2 x ptr> %arg) { +; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_ptr_mixed_full_ptrvec +; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)> +; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[TMP2]] +; +entry: + %alloca = alloca [4 x i32], align 8, addrspace(5) + store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <4 x ptr 
addrspace(3)>, ptr addrspace(5) %alloca, align 8 + ret <4 x ptr addrspace(3)> %tmp +} + +; Currently rejected due to the store not being cast-able. +; TODO: We should probably be able to vectorize this +define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) { +; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec +; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 8, addrspace(5) +; CHECK-NEXT: store <2 x ptr addrspace(3)> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP:%.*]] = load <2 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP_FULL:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [8 x i32], align 8, addrspace(5) + store <2 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <2 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 + %tmp.full = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 + ret void +} + +; Will not vectorize because we're accessing a 64 bit vector with a 32 bits pointer. +define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) { +; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec +; CHECK-SAME: (ptr addrspace(3) [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 8, addrspace(5) +; CHECK-NEXT: store ptr addrspace(3) [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: ret ptr addrspace(3) [[TMP]] +; +entry: + %alloca = alloca [8 x i8], align 8, addrspace(5) + store ptr addrspace(3) %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr addrspace(3), ptr addrspace(5) %alloca, align 8 + ret ptr addrspace(3) %tmp +} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll @@ -1,19 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca,sroa < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s ; Checks that memsets don't block PromoteAlloca. -; Note: memsets are just updated with the new type size. They are not eliminated which means -; the original alloca also stay. This puts a bit more load on SROA. -; If PromoteAlloca is moved to SSAUpdater, we could just entirely replace the memsets with -; e.g. ConstantAggregate. 
-
 define amdgpu_kernel void @memset_all_zero(i64 %val) {
 ; CHECK-LABEL: @memset_all_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <6 x i64> zeroinitializer, i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <6 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -30,8 +24,7 @@
 ; CHECK-LABEL: @memset_all_5(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> , i64 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[VAL]], i64 1
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -47,11 +40,9 @@
 define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_volatile_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_2]], i8 0, i64 24, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 32, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -64,11 +55,9 @@
 define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_badsize_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_0:%.*]] = alloca i64, align 8, addrspace(5)
-; CHECK-NEXT: [[STACK_SROA_2:%.*]] = alloca [23 x i8], align 4, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_0]], i8 0, i64 8, i1 true)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[STACK_SROA_2]], i8 0, i64 23, i1 true)
-; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK_SROA_0]], align 8
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[STACK]], i8 0, i64 31, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -81,8 +70,10 @@
 define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) {
 ; CHECK-LABEL: @memset_offset_ptr_nopromote(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[STACK_SROA_1:%.*]] = alloca [3 x i64], align 8, addrspace(5)
-; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 8 [[STACK_SROA_1]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x i64], ptr addrspace(5) [[STACK]], i64 0, i64 1
+; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[GEP]], i8 0, i64 24, i1 true)
+; CHECK-NEXT: store i64 [[VAL:%.*]], ptr addrspace(5) [[STACK]], align 4
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@@ -4,15 +4,10 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT: entry:
-; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
-; OPT-NEXT: [[TMP0:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP1:%.*]] = inttoptr i64 [[V:%.*]] to ptr
-; OPT-NEXT: [[TMP2:%.*]] = insertelement <3 x ptr> [[TMP0]], ptr [[TMP1]], i32 0
-; OPT-NEXT: store <3 x ptr> [[TMP2]], ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP3:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 16
-; OPT-NEXT: [[TMP4:%.*]] = extractelement <3 x ptr> [[TMP3]], i32 0
-; OPT-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64
-; OPT-NEXT: ret i64 [[TMP5]]
+; OPT-NEXT: [[TMP0:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT: [[TMP1:%.*]] = insertelement <3 x ptr> undef, ptr [[TMP0]], i32 0
+; OPT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; OPT-NEXT: ret i64 [[TMP2]]
 ;
 entry:
 %a = alloca [3 x ptr], align 16, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
@@ -11,11 +11,8 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x float> , ptr addrspace(5) %alloca, align 4
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT: store float %1, ptr addrspace(1) %out, align 4
+; OPT: %0 = extractelement <4 x float> , i32 %sel2
+; OPT: store float %0, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -46,12 +43,8 @@
 ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN: store_dwordx4 v{{.+}},
-; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
-; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
+; OPT: %0 = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel2
+; OPT: store <4 x float> %0, ptr addrspace(1) %out, align 4
 define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -77,11 +70,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x half> , i32 %sel2
+; OPT: store half %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -105,12 +95,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x half> undef, half 0xH3C00, i32 %sel2
+; OPT: store <4 x half> %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -136,11 +122,8 @@
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> , ptr addrspace(5) %alloca, align 2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, ptr addrspace(1) %out, align 2
+; OPT: %0 = extractelement <4 x i16> , i32 %sel2
+; OPT: store i16 %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -164,12 +147,8 @@
 ; GCN-NOT: buffer_
 ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
-; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
-; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
-; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
+; OPT: %0 = insertelement <4 x i16> undef, i16 1, i32 %sel2
+; OPT: store <4 x i16> %0, ptr addrspace(1) %out, align 2
 define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
@@ -193,8 +172,7 @@
 ; GCN-NOT: buffer_
 ; GCN: v_mov_b32_e32 v1, 0
-; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
+; OPT: ret i64 undef
 define i64 @ptr_alloca_bitcast() {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
--- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
@@ -11,7 +11,7 @@
 ; so that we do not need to fully unroll it.
 ; FUNC-LABEL: @private_memory
-; LOOP-NOT: alloca
+; LOOP-NOT: = alloca
 ; LOOP: loop.header:
 ; LOOP: br i1 %{{[^,]+}}, label %exit, label %loop.header
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
-; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn- -passes=sroa %s -o %t.sroa.ll
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %t.sroa.ll | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='sroa,amdgpu-promote-alloca,instcombine' < %s | FileCheck -check-prefix=OPT %s
 target datalayout = "A5"
@@ -75,8 +76,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_float(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
+; OPT: %promotealloca = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -84,24 +85,13 @@
 ; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float:
 ; GCN-ALLOCA: buffer_store_dword
-; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16
-; GCN-PROMOTE-COUNT-6: v_cndmask
+; GCN-PROMOTE: v_cmp_eq_u16
+; GCN-PROMOTE: v_cndmask
 ; GCN: s_cbranch
 ; GCN-ALLOCA: buffer_load_dword
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-; GCN-PROMOTE: v_cmp_eq_u16
-; GCN-PROMOTE: v_cndmask
-
 ; GCN-PROMOTE: ScratchSize: 0
 define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
@@ -147,8 +137,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_double(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
+; OPT: %promotealloca = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
@@ -208,8 +198,8 @@
 ; OPT-LABEL: @vector_write_read_bitcast_to_i64(
 ; OPT-NOT: alloca
 ; OPT: bb2:
-; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9
+; OPT: %promotealloca = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ]
+; OPT: %0 = insertelement <6 x i64> %promotealloca, i64 %tmp6, i32 %tmp9
 ; OPT: .preheader:
 ; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18
@@ -272,7 +262,7 @@
 ; OPT: store i32 %0, ptr addrspace(1) %out, align 4
 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
-; GCN-COUNT-4: buffer_store_dword
+; GCN-COUNT: buffer_store_dword
 define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry: