Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -316,25 +316,53 @@
   return GEP->getOperand(2);
 }
 
-// Not an instruction handled below to turn into a vector.
+// NOTE: We mainly check whether a load or a store is vectorizable here.
+// A special case is a BITCAST of a GEP, in which case we check
+// whether all users of the BITCAST are vectorizable.
 //
 // TODO: Check isTriviallyVectorizable for calls and handle other
 // instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *Used,
+                             std::vector<Value*> &WorkList) {
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
     LoadInst *LI = cast<LoadInst>(Inst);
-    // Currently only handle the case where the Pointer Operand is a GEP so check for that case.
-    return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
+    if (LI->isVolatile())
+      return false;
+    // Currently we only handle the case where the pointer operand is the
+    // value being used (Used) and that value is a GEP or a BITCAST.
+    if (LI->getPointerOperand() != Used ||
+        (!isa<GetElementPtrInst>(Used) && !isa<BitCastInst>(Used)))
+      return false;
+    WorkList.push_back(Inst);
+    return true;
+  }
+  case Instruction::BitCast: {
+    if (isa<GetElementPtrInst>(Used)) {
+      for (User *BCUser : Inst->users()) {
+        if (!canVectorizeInst(cast<Instruction>(BCUser), Inst, WorkList))
+          return false;
+      }
+      return true;
+    }
+    // Otherwise, fall through to the AddrSpaceCast case below.
+    // TODO: we do not actually have logic to handle general bitcast and
+    //       addrspacecast. We may have to be conservative here to avoid
+    //       unexpected results.
   }
-  case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
     return true;
   case Instruction::Store: {
-    // Must be the stored pointer operand, not a stored value, plus
-    // since it should be canonical form, the User should be a GEP.
     StoreInst *SI = cast<StoreInst>(Inst);
-    return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
+    if (SI->isVolatile())
+      return false;
+    // Currently we only handle the case where the pointer operand is the
+    // value being used (Used) and that value is a GEP or a BITCAST.
+    if (SI->getPointerOperand() != Used ||
+        (!isa<GetElementPtrInst>(Used) && !isa<BitCastInst>(Used)))
+      return false;
+    WorkList.push_back(Inst);
+    return true;
   }
   default:
     return false;
@@ -369,10 +397,8 @@
   for (User *AllocaUser : Alloca->users()) {
     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
     if (!GEP) {
-      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca, WorkList))
         return false;
-
-      WorkList.push_back(AllocaUser);
       continue;
     }
 
@@ -387,25 +413,29 @@
 
     GEPVectorIdx[GEP] = Index;
     for (User *GEPUser : AllocaUser->users()) {
-      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
+      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser, WorkList))
         return false;
-
-      WorkList.push_back(GEPUser);
     }
   }
 
-  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+  VectorType *VectorT = arrayTypeToVecType(AllocaTy);
 
   DEBUG(dbgs() << "  Converting alloca to vector "
-        << *AllocaTy << " -> " << *VectorTy << '\n');
+        << *AllocaTy << " -> " << *VectorT << '\n');
 
   for (Value *V : WorkList) {
     Instruction *Inst = cast<Instruction>(V);
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
+      VectorType *VectorTy = VectorT;
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(Ptr)) {
+        VectorTy = VectorType::get(Ptr->getType()->getPointerElementType(),
+                                   AllocaTy->getNumElements());
+        Ptr = BC->getOperand(0);
+      }
+      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
@@ -416,10 +446,15 @@
       break;
     }
     case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
-
       StoreInst *SI = cast<StoreInst>(Inst);
       Value *Ptr = SI->getPointerOperand();
+      VectorType *VectorTy = VectorT;
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(Ptr)) {
+        VectorTy = VectorType::get(Ptr->getType()->getPointerElementType(),
+                                   AllocaTy->getNumElements());
+        Ptr = BC->getOperand(0);
+      }
+      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
       Value *VecValue = Builder.CreateLoad(BitCast);
Index: test/CodeGen/AMDGPU/vector-alloca.ll
===================================================================
--- test/CodeGen/AMDGPU/vector-alloca.ll
+++ test/CodeGen/AMDGPU/vector-alloca.ll
@@ -64,33 +64,6 @@
   ret void
 }
 
-; This test should be optimize to:
-; store i32 0, i32 addrspace(1)* %out
-
-; OPT-LABEL: @bitcast_gep(
-; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
-
-; FUNC-LABEL: {{^}}bitcast_gep:
-; EG: STORE_RAW
-define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
-entry:
-  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 0, i32 addrspace(5)* %y
-  store i32 0, i32 addrspace(5)* %z
-  store i32 0, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
-  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
-  %tmp4 = load i32, i32 addrspace(5)* %tmp3
-  store i32 %tmp4, i32 addrspace(1)* %out
-  ret void
-}
-
 ; OPT-LABEL: @vector_read_bitcast_gep(
 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
@@ -161,3 +134,81 @@
   store i32 %tmp2, i32 addrspace(1)* %out
   ret void
 }
+
+; OPT-LABEL: @write_bitcast_gep_read(
+; OPT: %0 = insertelement <3 x i32> zeroinitializer, i32 12, i32 %w_index
+; OPT: %1 = bitcast <3 x i32> %0 to <3 x float>
+; OPT: %2 = extractelement <3 x float> %1, i32 %r_index
+; OPT: store float %2, float addrspace(1)* %out, align 4
+define amdgpu_kernel void @write_bitcast_gep_read(float addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+  %scratch = alloca [3 x i32], addrspace(5)
+  %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0
+  %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1
+  %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2
+  store i32 0, i32 addrspace(5)* %x
+  store i32 0, i32 addrspace(5)* %y
+  store i32 0, i32 addrspace(5)* %z
+
+  %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index
+  store i32 12, i32 addrspace(5)* %gep_write, align 4
+
+  %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index
+  %bc_read = bitcast i32 addrspace(5)* %gep_read to float addrspace(5)*
+  %result = load float, float addrspace(5)* %bc_read
+  store float %result, float addrspace(1)* %out
+
+  ret void
+}
+
+; OPT-LABEL: @bitcast_gep_write_read(
+; OPT: %0 = insertelement <3 x float> zeroinitializer, float 1.200000e+01, i32 %w_index
+; OPT: %1 = bitcast <3 x float> %0 to <3 x i32>
+; OPT: %2 = extractelement <3 x i32> %1, i32 %r_index
+; OPT: store i32 %2, i32 addrspace(1)* %out, align 4
+define amdgpu_kernel void @bitcast_gep_write_read(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+  %scratch = alloca [3 x i32], addrspace(5)
+  %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0
+  %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1
+  %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2
+  store i32 0, i32 addrspace(5)* %x
+  store i32 0, i32 addrspace(5)* %y
+  store i32 0, i32 addrspace(5)* %z
+
+  %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index
+  %bc_write = bitcast i32 addrspace(5)* %gep_write to float addrspace(5)*
+  store float 12.0, float addrspace(5)* %bc_write, align 4
+
+  %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index
+  %result = load i32, i32 addrspace(5)* %gep_read
+  store i32 %result, i32 addrspace(1)* %out
+
+  ret void
+}
+
+; OPT-LABEL: @bitcast_gep_write_bitcast_gep_read(
+; OPT: %0 = insertelement <3 x float> zeroinitializer, float 1.200000e+01, i32 %w_index
+; OPT: %1 = extractelement <3 x float> %0, i32 %r_index
+; OPT: store float %1, float addrspace(1)* %out, align 4
+define amdgpu_kernel void @bitcast_gep_write_bitcast_gep_read(float addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+entry:
+  %scratch = alloca [3 x i32], addrspace(5)
+  %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0
+  %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1
+  %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2
+  store i32 0, i32 addrspace(5)* %x
+  store i32 0, i32 addrspace(5)* %y
+  store i32 0, i32 addrspace(5)* %z
+
+  %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index
+  %bc_write = bitcast i32 addrspace(5)* %gep_write to float addrspace(5)*
+  store float 12.0, float addrspace(5)* %bc_write, align 4
+
+  %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index
+  %bc_read = bitcast i32 addrspace(5)* %gep_read to float addrspace(5)*
+  %result = load float, float addrspace(5)* %bc_read
+  store float %result, float addrspace(1)* %out
+
+  ret void
+}