diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -21,6 +21,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetMachine.h"
@@ -384,6 +385,28 @@
   ConstantInt *DestIndex = nullptr;
 };
 
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(Instruction *I, AllocaInst *AI,
+                              unsigned AllocaSize) {
+  using namespace PatternMatch;
+  Value *Ptr = nullptr;
+  uint64_t Len;
+  uint64_t ByteVal;
+  if (!match(I, m_Intrinsic<Intrinsic::memset>(m_Value(Ptr),
+                                               m_ConstantInt(ByteVal),
+                                               m_ConstantInt(Len), m_Zero())) &&
+      !match(I, m_Intrinsic<Intrinsic::memset_inline>(
+                    m_Value(Ptr), m_ConstantInt(ByteVal), m_ConstantInt(Len),
+                    m_Zero())))
+    return false;
+
+  // For now we only care about memsets that affect the whole type (start at
+  // index 0 and fill the whole alloca).
+  return Ptr == AI && Len == AllocaSize;
+}
+
 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                      unsigned MaxVGPRs) {
@@ -535,6 +558,11 @@
         }))
       continue;
 
+    if (isSupportedMemset(Inst, Alloca, DL.getTypeSizeInBits(AllocaTy) / 8)) {
+      WorkList.push_back(Inst);
+      continue;
+    }
+
     // Unknown user.
     return false;
   }
@@ -586,6 +614,7 @@
       break;
     }
     case Instruction::Call: {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
      if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
        ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
        unsigned NumCopied = Length->getZExtValue() / ElementSize;
@@ -609,6 +638,12 @@
        Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
 
        Inst->eraseFromParent();
+      } else if (II->getIntrinsicID() == Intrinsic::memset ||
+                 II->getIntrinsicID() == Intrinsic::memset_inline) {
+        // Ensure the length parameter of the memsets matches the new vector
+        // type's. In general, the type size shouldn't change so this is a
+        // no-op, but it's better to be safe.
+        II->setOperand(2, Builder.getInt64(DL.getTypeSizeInBits(VectorTy) / 8));
      } else {
        llvm_unreachable("Unsupported call when promoting alloca to vector");
      }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Checks that memsets don't block PromoteAlloca.
+
+; Note: memsets are just updated with the new type size. They are not eliminated, which means
+; the original alloca also stays. This puts a bit more load on SROA.
+; If PromoteAlloca is moved to SSAUpdater, we could replace the memsets entirely with,
+; e.g., a ConstantAggregate.
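+;
+; (Illustrative note, not from the original patch:) in @memset_all_zero below, the
+; [6 x double] alloca occupies 6 * 8 = 48 bytes, so the 48-byte memset covers the
+; whole alloca and isSupportedMemset accepts it. The pass then keeps the memset and
+; rewrites its length operand to the size of the promoted <6 x double> type, which
+; is still 48 bytes.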
+ +; CHECK-LABEL: @memset_all_zero( +; CHECK: <6 x double> +define amdgpu_kernel void @memset_all_zero(double %val) { +entry: + %stack = alloca [6 x double], align 4, addrspace(5) + call void @llvm.memset.p5.double(ptr addrspace(5) %stack, i8 0, i64 48, i1 false) + store double %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_all_5( +; CHECK: <4 x i64> +define amdgpu_kernel void @memset_all_5(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 5, i64 32, i1 false) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_volatile_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_volatile_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 32, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_badsize_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_badsize_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + call void @llvm.memset.p5.i64(ptr addrspace(5) %stack, i8 0, i64 31, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +; CHECK-LABEL: @memset_offset_ptr_nopromote( +; CHECK-NOT: <4 x i64> +define amdgpu_kernel void @memset_offset_ptr_nopromote(i64 %val) { +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + %gep = getelementptr [4 x i64], ptr addrspace(5) %stack, i64 0, i64 1 + call void @llvm.memset.p5.i64(ptr addrspace(5) %gep, i8 0, i64 24, i1 true) + store i64 %val, ptr addrspace(5) %stack + ret void +} + +declare void @llvm.memset.p5.double(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
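
; (Illustrative sketch, not FileCheck-verified output of this patch:) after promotion,
; @memset_all_zero is expected to keep its memset (length still 48) while the scalar
; store of %val is rewritten through the promoted vector type, roughly:
;
;   %stack = alloca [6 x double], align 4, addrspace(5)
;   call void @llvm.memset.p5.double(ptr addrspace(5) %stack, i8 0, i64 48, i1 false)
;   %cur = load <6 x double>, ptr addrspace(5) %stack
;   %ins = insertelement <6 x double> %cur, double %val, i32 0
;   store <6 x double> %ins, ptr addrspace(5) %stack
;
; Exact value names, casts, and alignments depend on the pass implementation.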