Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -316,25 +316,53 @@ return GEP->getOperand(2); } -// Not an instruction handled below to turn into a vector. +// NOTE: We mainly check whether a load or a store is vectorizable here. +// A special case here is BITCAST of a GEP, in which case we check +// whether all users of the BITCAST are vectorizable. // // TODO: Check isTriviallyVectorizable for calls and handle other // instructions. -static bool canVectorizeInst(Instruction *Inst, User *User) { +static bool canVectorizeInst(Instruction *Inst, User *Used, + std::vector &WorkList) { switch (Inst->getOpcode()) { case Instruction::Load: { LoadInst *LI = cast(Inst); - // Currently only handle the case where the Pointer Operand is a GEP so check for that case. - return isa(LI->getPointerOperand()) && !LI->isVolatile(); + if (LI->isVolatile()) + return false; + // Currently only handle the case where the Pointer Operand is a GEP + // or a BITCAST. + if (LI->getPointerOperand() != Used || + (!isa(Used) && !isa(Used))) + return false; + WorkList.push_back(Inst); + return true; + } + case Instruction::BitCast: { + if (isa(Used)) { + for (User *BCUser : Inst->users()) { + if (!canVectorizeInst(cast(BCUser), Inst, WorkList)) + return false; + } + return true; + } + // Fallthrough otherwise. + // TODO: we do not actually have logic to handle general bitcast and + // addrspacecast. We may have to be conservative here to avoid + // unexpected results. } - case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; case Instruction::Store: { - // Must be the stored pointer operand, not a stored value, plus - // since it should be canonical form, the User should be a GEP. 
StoreInst *SI = cast(Inst); - return (SI->getPointerOperand() == User) && isa(User) && !SI->isVolatile(); + if (SI->isVolatile()) + return false; + // Currently only handle the case where the Pointer Operand is a GEP + // or a BITCAST. + if (SI->getPointerOperand() != Used || + (!isa(Used) && !isa(Used))) + return false; + WorkList.push_back(Inst); + return true; } default: return false; @@ -369,10 +397,8 @@ for (User *AllocaUser : Alloca->users()) { GetElementPtrInst *GEP = dyn_cast(AllocaUser); if (!GEP) { - if (!canVectorizeInst(cast(AllocaUser), Alloca)) + if (!canVectorizeInst(cast(AllocaUser), Alloca, WorkList)) return false; - - WorkList.push_back(AllocaUser); continue; } @@ -387,25 +413,29 @@ GEPVectorIdx[GEP] = Index; for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast(GEPUser), AllocaUser)) + if (!canVectorizeInst(cast(GEPUser), AllocaUser, WorkList)) return false; - - WorkList.push_back(GEPUser); } } - VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + VectorType *VectorT = arrayTypeToVecType(AllocaTy); DEBUG(dbgs() << " Converting alloca to vector " - << *AllocaTy << " -> " << *VectorTy << '\n'); + << *AllocaTy << " -> " << *VectorT << '\n'); for (Value *V : WorkList) { Instruction *Inst = cast(V); IRBuilder<> Builder(Inst); switch (Inst->getOpcode()) { case Instruction::Load: { - Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); Value *Ptr = cast(Inst)->getPointerOperand(); + VectorType *VectorTy = VectorT; + if (BitCastInst *BC = dyn_cast(Ptr)) { + VectorTy = VectorType::get(Ptr->getType()->getPointerElementType(), + AllocaTy->getNumElements()); + Ptr = BC->getOperand(0); + } + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); @@ -416,10 +446,15 @@ break; } case Instruction::Store: { - Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); - StoreInst *SI = cast(Inst); Value *Ptr = 
SI->getPointerOperand(); + VectorType *VectorTy = VectorT; + if (BitCastInst *BC = dyn_cast(Ptr)) { + VectorTy = VectorType::get(Ptr->getType()->getPointerElementType(), + AllocaTy->getNumElements()); + Ptr = BC->getOperand(0); + } + Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS); Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(BitCast); Index: test/CodeGen/AMDGPU/vector-alloca.ll =================================================================== --- test/CodeGen/AMDGPU/vector-alloca.ll +++ test/CodeGen/AMDGPU/vector-alloca.ll @@ -64,33 +64,6 @@ ret void } -; This test should be optimize to: -; store i32 0, i32 addrspace(1)* %out - -; OPT-LABEL: @bitcast_gep( -; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4 - -; FUNC-LABEL: {{^}}bitcast_gep: -; EG: STORE_RAW -define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { -entry: - %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 0, i32 addrspace(5)* %y - store i32 0, i32 addrspace(5)* %z - store i32 0, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)* - %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0 - %tmp4 = load i32, i32 addrspace(5)* %tmp3 - store i32 %tmp4, i32 addrspace(1)* %out - ret void -} - ; OPT-LABEL: @vector_read_bitcast_gep( ; OPT: %0 = extractelement <4 x i32> , i32 %index ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 @@ -161,3 +134,81 @@ store i32 %tmp2, i32 
addrspace(1)* %out ret void } + +; OPT-LABEL: @write_bitcast_gep_read( +; OPT: %0 = insertelement <3 x i32> zeroinitializer, i32 12, i32 %w_index +; OPT: %1 = bitcast <3 x i32> %0 to <3 x float> +; OPT: %2 = extractelement <3 x float> %1, i32 %r_index +; OPT: store float %2, float addrspace(1)* %out, align 4 +define amdgpu_kernel void @write_bitcast_gep_read(float addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %scratch = alloca [3 x i32], addrspace(5) + %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0 + %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1 + %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2 + store i32 0, i32 addrspace(5)* %x + store i32 0, i32 addrspace(5)* %y + store i32 0, i32 addrspace(5)* %z + + %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index + store i32 12, i32 addrspace(5)* %gep_write, align 4 + + %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index + %bc_read = bitcast i32 addrspace(5)* %gep_read to float addrspace(5)* + %result = load float, float addrspace(5)* %bc_read + store float %result, float addrspace(1)* %out + + ret void +} + +; OPT-LABEL: @bitcast_gep_write_read( +; OPT: %0 = insertelement <3 x float> zeroinitializer, float 1.200000e+01, i32 %w_index +; OPT: %1 = bitcast <3 x float> %0 to <3 x i32> +; OPT: %2 = extractelement <3 x i32> %1, i32 %r_index +; OPT: store i32 %2, i32 addrspace(1)* %out, align 4 +define amdgpu_kernel void @bitcast_gep_write_read(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %scratch = alloca [3 x i32], addrspace(5) + %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0 + %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1 + %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2 + store i32 0, i32 addrspace(5)* %x + store i32 0, i32 
addrspace(5)* %y + store i32 0, i32 addrspace(5)* %z + + %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index + %bc_write = bitcast i32 addrspace(5)* %gep_write to float addrspace(5)* + store float 12.0, float addrspace(5)* %bc_write, align 4 + + %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index + %result = load i32, i32 addrspace(5)* %gep_read + store i32 %result, i32 addrspace(1)* %out + + ret void +} + +; OPT-LABEL: @bitcast_gep_write_bitcast_gep_read( +; OPT: %0 = insertelement <3 x float> zeroinitializer, float 1.200000e+01, i32 %w_index +; OPT: %1 = extractelement <3 x float> %0, i32 %r_index +; OPT: store float %1, float addrspace(1)* %out, align 4 +define amdgpu_kernel void @bitcast_gep_write_bitcast_gep_read(float addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %scratch = alloca [3 x i32], addrspace(5) + %x = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 0 + %y = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 1 + %z = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 2 + store i32 0, i32 addrspace(5)* %x + store i32 0, i32 addrspace(5)* %y + store i32 0, i32 addrspace(5)* %z + + %gep_write = getelementptr inbounds [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %w_index + %bc_write = bitcast i32 addrspace(5)* %gep_write to float addrspace(5)* + store float 12.0, float addrspace(5)* %bc_write, align 4 + + %gep_read = getelementptr [3 x i32], [3 x i32] addrspace(5)* %scratch, i32 0, i32 %r_index + %bc_read = bitcast i32 addrspace(5)* %gep_read to float addrspace(5)* + %result = load float, float addrspace(5)* %bc_read + store float %result, float addrspace(1)* %out + + ret void +}