Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -468,7 +468,7 @@
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
@@ -486,7 +486,8 @@
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Index: llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -345,6 +345,50 @@
   ret void
 }
 
+; OPT-LABEL: @bitcast_vector_to_vector(
+; OPT-NOT:   alloca
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}bitcast_vector_to_vector:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_bitcast_from_alloca_array(
+; OPT-NOT:   alloca
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out)  {
+.entry:
+  %alloca = alloca [4 x float], align 16, addrspace(5)
+  %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
 declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
 
 declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)