Index: llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -302,10 +302,19 @@ ArrayTy->getNumElements()); } +static Value *stripBitcasts(Value *V) { + while (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->getOpcode() != Instruction::BitCast) + break; + V = I->getOperand(0); + } + return V; +} + static Value * calculateVectorIndex(Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) { - GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + GetElementPtrInst *GEP = cast<GetElementPtrInst>(stripBitcasts(Ptr)); auto I = GEPIdx.find(GEP); return I == GEPIdx.end() ? nullptr : I->second; @@ -327,7 +336,8 @@ // // TODO: Check isTriviallyVectorizable for calls and handle other // instructions. -static bool canVectorizeInst(Instruction *Inst, User *User) { +static bool canVectorizeInst(Instruction *Inst, User *User, + const DataLayout &DL) { switch (Inst->getOpcode()) { case Instruction::Load: { // Currently only handle the case where the Pointer Operand is a GEP. 
@@ -337,7 +347,14 @@ LI->getPointerOperandType() == User->getType() && isa<VectorType>(LI->getType())) return true; - return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple(); + + Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand()); + if (!PtrInst) + return false; + + return (PtrInst->getOpcode() == Instruction::GetElementPtr || + PtrInst->getOpcode() == Instruction::BitCast) && + LI->isSimple(); } case Instruction::BitCast: return true; @@ -350,14 +367,22 @@ SI->getPointerOperandType() == User->getType() && isa<VectorType>(SI->getValueOperand()->getType())) return true; - return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple(); + + Instruction *UserInst = dyn_cast<Instruction>(User); + if (!UserInst) + return false; + + return (SI->getPointerOperand() == User) && + (UserInst->getOpcode() == Instruction::GetElementPtr || + UserInst->getOpcode() == Instruction::BitCast) && + SI->isSimple(); } default: return false; } } -static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { +static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) { if (DisablePromoteAllocaToVector) { LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n"); @@ -385,13 +410,37 @@ } std::map<GetElementPtrInst *, Value *> GEPVectorIdx; - std::vector<Value*> WorkList; - for (User *AllocaUser : Alloca->users()) { + std::vector<Value *> WorkList; + SmallVector<User *, 8> Users(Alloca->users()); + SmallVector<User *, 8> UseUsers(Users.size(), Alloca); + Type *VecEltTy = VectorTy->getElementType(); + while (!Users.empty()) { + User *AllocaUser = Users.pop_back_val(); + User *UseUser = UseUsers.pop_back_val(); + Instruction *Inst = dyn_cast<Instruction>(AllocaUser); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); if (!GEP) { - if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca)) + if (!canVectorizeInst(Inst, UseUser, DL)) return false; + if (Inst->getOpcode() == Instruction::BitCast) { + Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType(); + Type *ToTy = Inst->getType()->getPointerElementType(); + if (FromTy->isAggregateType() || ToTy->isAggregateType() || 
+ DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy)) + continue; + + for (User *CastUser : Inst->users()) { + if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser))) + continue; + Users.push_back(CastUser); + UseUsers.push_back(Inst); + } + + continue; + } + WorkList.push_back(AllocaUser); continue; } @@ -407,12 +456,8 @@ } GEPVectorIdx[GEP] = Index; - for (User *GEPUser : AllocaUser->users()) { - if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser)) - return false; - - WorkList.push_back(GEPUser); - } + Users.append(GEP->user_begin(), GEP->user_end()); + UseUsers.append(GEP->getNumUses(), GEP); } LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " @@ -433,6 +478,8 @@ Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + if (Inst->getType() != VecEltTy) + ExtractElement = Builder.CreateBitCast(ExtractElement, Inst->getType()); Inst->replaceAllUsesWith(ExtractElement); Inst->eraseFromParent(); break; @@ -447,16 +494,14 @@ Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy); Value *VecValue = Builder.CreateLoad(VectorTy, BitCast); - Value *NewVecValue = Builder.CreateInsertElement(VecValue, - SI->getValueOperand(), - Index); + Value *Elt = SI->getValueOperand(); + if (Elt->getType() != VecEltTy) + Elt = Builder.CreateBitCast(Elt, VecEltTy); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index); Builder.CreateStore(NewVecValue, BitCast); Inst->eraseFromParent(); break; } - case Instruction::BitCast: - case Instruction::AddrSpaceCast: - break; default: llvm_unreachable("Inconsistency in instructions promotable to vector"); @@ -721,6 +766,7 @@ if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; + const DataLayout &DL = Mod->getDataLayout(); IRBuilder<> Builder(&I); // First try to replace the alloca with a vector @@ -728,7 +774,7 @@ 
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I)) + if (tryPromoteAllocaToVector(&I, DL)) return true; // Promoted to vector. if (DisablePromoteAllocaToLDS) @@ -758,8 +804,6 @@ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - const DataLayout &DL = Mod->getDataLayout(); - unsigned Align = I.getAlignment(); if (Align == 0) Align = DL.getABITypeAlignment(I.getAllocatedType()); Index: llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -0,0 +1,275 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s + +target datalayout = "A5" + +; OPT-LABEL: @vector_read_alloca_bitcast( +; OPT-NOT: alloca +; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index +; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4 + +; GCN-LABEL: {{^}}vector_read_alloca_bitcast: +; GCN-ALLOCA-COUNT-4: buffer_store_dword +; GCN-ALLOCA: buffer_load_dword + +; GCN-PROMOTE: v_cmp_eq_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 1 +; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] +; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC2:[^,]+]], s{{[0-9]+}}, 2 +; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], [[CC2]] +; GCN-PROMOTE: v_cmp_ne_u32_e64 [[CC3:[^,]+]], s{{[0-9]+}}, 3 +; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], [[CC3]] + +; GCN-PROMOTE: ScratchSize: 0 + +define 
amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32], addrspace(5) + %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)* + %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 + store i32 0, i32 addrspace(5)* %x + store i32 1, i32 addrspace(5)* %y + store i32 2, i32 addrspace(5)* %z + store i32 3, i32 addrspace(5)* %w + %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32 addrspace(5)* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @vector_write_alloca_bitcast( +; OPT-NOT: alloca +; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index +; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index +; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align + +; GCN-LABEL: {{^}}vector_write_alloca_bitcast: +; GCN-ALLOCA-COUNT-5: buffer_store_dword +; GCN-ALLOCA: buffer_load_dword + +; GCN-PROMOTE-COUNT-7: v_cndmask + +; GCN-PROMOTE: ScratchSize: 0 + +define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %tmp = alloca [4 x i32], addrspace(5) + %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)* + %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 + store i32 0, i32 addrspace(5)* %x + store i32 0, i32 addrspace(5)* %y + store i32 0, i32 addrspace(5)* %z + store i32 0, i32 addrspace(5)* %w + %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index + store i32 1, i32 addrspace(5)* %tmp1 + %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 
%r_index + %tmp3 = load i32, i32 addrspace(5)* %tmp2 + store i32 %tmp3, i32 addrspace(1)* %out + ret void +} + +; OPT-LABEL: @vector_write_read_bitcast_to_float( +; OPT-NOT: alloca +; OPT: bb2: +; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ] +; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10 +; OPT: .preheader: +; OPT: %bc = bitcast <6 x float> %0 to <6 x i32> +; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20 + +; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float: +; GCN-ALLOCA: buffer_store_dword + +; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16 +; GCN-PROMOTE-COUNT-6: v_cndmask + +; GCN: s_cbranch + +; GCN-ALLOCA: buffer_load_dword + +; GCN-PROMOTE: v_cmp_eq_u16 +; GCN-PROMOTE: v_cndmask +; GCN-PROMOTE: v_cmp_eq_u16 +; GCN-PROMOTE: v_cndmask +; GCN-PROMOTE: v_cmp_eq_u16 +; GCN-PROMOTE: v_cndmask +; GCN-PROMOTE: v_cmp_eq_u16 +; GCN-PROMOTE: v_cndmask +; GCN-PROMOTE: v_cmp_eq_u16 +; GCN-PROMOTE: v_cndmask + +; GCN-PROMOTE: ScratchSize: 0 + +define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) { +bb: + %tmp = alloca [6 x float], align 4, addrspace(5) + %tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2 + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4 + %tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)* + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = trunc i32 %tmp3 to i16 + %tmp9 = urem i16 %tmp8, 6 + %tmp10 = zext i16 %tmp9 to i32 + %tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10 + %tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)* + store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4 + %tmp13 = add nuw nsw i32 %tmp3, 1 + %tmp14 = icmp eq i32 %tmp13, 1000 + br i1 
%tmp14, label %.preheader, label %bb2 + +bb15: ; preds = %.preheader + call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ] + %tmp17 = trunc i32 %tmp16 to i16 + %tmp18 = urem i16 %tmp17, 6 + %tmp19 = sub nuw nsw i16 5, %tmp18 + %tmp20 = zext i16 %tmp19 to i32 + %tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20 + %tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)* + %tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4 + %tmp24 = zext i32 %tmp16 to i64 + %tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24 + %tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)* + store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4 + %tmp27 = add nuw nsw i32 %tmp16, 1 + %tmp28 = icmp eq i32 %tmp27, 1000 + br i1 %tmp28, label %bb15, label %.preheader +} + +; OPT-LABEL: @vector_write_read_bitcast_to_double( +; OPT-NOT: alloca +; OPT: bb2: +; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ] +; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10 +; OPT: .preheader: +; OPT: %bc = bitcast <6 x double> %0 to <6 x i64> +; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20 + +; TODO: Fix selection to eliminate scratch + +; GCN-LABEL: {{^}}vector_write_read_bitcast_to_double: +; GCN-COUNT-2: buffer_store_dword + +; GCN: s_cbranch + +; GCN-COUNT-2: buffer_load_dword + +define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) { +bb: + %tmp = alloca [6 x double], align 8, addrspace(5) + %tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = getelementptr inbounds double, double addrspace(1)* 
%arg, i64 %tmp4 + %tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)* + %tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8 + %tmp8 = trunc i32 %tmp3 to i16 + %tmp9 = urem i16 %tmp8, 6 + %tmp10 = zext i16 %tmp9 to i32 + %tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10 + %tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)* + store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8 + %tmp13 = add nuw nsw i32 %tmp3, 1 + %tmp14 = icmp eq i32 %tmp13, 1000 + br i1 %tmp14, label %.preheader, label %bb2 + +bb15: ; preds = %.preheader + call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ] + %tmp17 = trunc i32 %tmp16 to i16 + %tmp18 = urem i16 %tmp17, 6 + %tmp19 = sub nuw nsw i16 5, %tmp18 + %tmp20 = zext i16 %tmp19 to i32 + %tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20 + %tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)* + %tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8 + %tmp24 = zext i32 %tmp16 to i64 + %tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24 + %tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)* + store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8 + %tmp27 = add nuw nsw i32 %tmp16, 1 + %tmp28 = icmp eq i32 %tmp27, 1000 + br i1 %tmp28, label %bb15, label %.preheader +} + +; OPT-LABEL: @vector_write_read_bitcast_to_i64( +; OPT-NOT: alloca +; OPT: bb2: +; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ] +; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9 +; OPT: .preheader: +; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18 + +; TODO: Fix selection to eliminate scratch + +; GCN-LABEL: {{^}}vector_write_read_bitcast_to_i64: +; GCN-COUNT-2: buffer_store_dword + +; GCN: s_cbranch + +; GCN-COUNT-2: buffer_load_dword + +define 
amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) { +bb: + %tmp = alloca [6 x i64], align 8, addrspace(5) + %tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ] + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4 + %tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8 + %tmp7 = trunc i32 %tmp3 to i16 + %tmp8 = urem i16 %tmp7, 6 + %tmp9 = zext i16 %tmp8 to i32 + %tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9 + store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8 + %tmp11 = add nuw nsw i32 %tmp3, 1 + %tmp12 = icmp eq i32 %tmp11, 1000 + br i1 %tmp12, label %.preheader, label %bb2 + +bb13: ; preds = %.preheader + call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmp14 = phi i32 [ %tmp23, %.preheader ], [ 0, %bb2 ] + %tmp15 = trunc i32 %tmp14 to i16 + %tmp16 = urem i16 %tmp15, 6 + %tmp17 = sub nuw nsw i16 5, %tmp16 + %tmp18 = zext i16 %tmp17 to i32 + %tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18 + %tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8 + %tmp21 = zext i32 %tmp14 to i64 + %tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21 + store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8 + %tmp23 = add nuw nsw i32 %tmp14, 1 + %tmp24 = icmp eq i32 %tmp23, 1000 + br i1 %tmp24, label %bb13, label %.preheader +} + +declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture) + +declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)