Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -468,7 +468,7 @@
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
@@ -486,7 +486,8 @@
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
         break;
 
       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Index: llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -345,6 +345,50 @@
   ret void
 }
 
+; OPT-LABEL: @bitcast_vector_to_vector(
+; OPT-NOT: alloca
+; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}bitcast_vector_to_vector:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
+.entry:
+  %alloca = alloca <4 x float>, align 16, addrspace(5)
+  %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; OPT-LABEL: @vector_bitcast_from_alloca_array(
+; OPT-NOT: alloca
+; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+
+; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
+; GCN: v_mov_b32_e32 v0, 1
+; GCN: v_mov_b32_e32 v1, 2
+; GCN: v_mov_b32_e32 v2, 3
+; GCN: v_mov_b32_e32 v3, 4
+
+; GCN: ScratchSize: 0
+
+define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
+.entry:
+  %alloca = alloca [4 x float], align 16, addrspace(5)
+  %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
+  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
+  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
 declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
 
 declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)