Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -33,6 +33,7 @@ private: const TargetMachine *TM; Module *Mod; + const DataLayout *DL; MDNode *MaxWorkGroupSizeRange; // FIXME: This should be per-kernel. @@ -44,6 +45,20 @@ std::pair getLocalSizeYZ(IRBuilder<> &Builder); Value *getWorkitemID(IRBuilder<> &Builder, unsigned N); + /// BaseAlloca is the alloca root the search started from. + /// Val may be that alloca or a recursive user of it. + bool collectUsesWithPtrTypes(Value *BaseAlloca, + Value *Val, + std::vector &WorkList) const; + + /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand + /// indices to an instruction with 2 pointer inputs (e.g. select, icmp). + /// Returns true if both operands are derived from the same alloca. Val should + /// be the same value as one of the input operands of UseInst. + bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val, + Instruction *UseInst, + int OpIdx0, int OpIdx1) const; + public: static char ID; @@ -51,6 +66,7 @@ FunctionPass(ID), TM(TM_), Mod(nullptr), + DL(nullptr), MaxWorkGroupSizeRange(nullptr), LocalMemAvailable(0), IsAMDGCN(false), @@ -63,6 +79,11 @@ return "AMDGPU Promote Alloca"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + void visitAlloca(AllocaInst &I); }; @@ -81,6 +102,7 @@ return false; Mod = &M; + DL = &Mod->getDataLayout(); // The maximum workitem id. 
// @@ -134,8 +156,7 @@ if (!Use) continue; if (Use->getParent()->getParent() == &F) - LocalMemAvailable -= - Mod->getDataLayout().getTypeAllocSize(GV->getValueType()); + LocalMemAvailable -= DL->getTypeAllocSize(GV->getValueType()); } } } @@ -404,7 +425,39 @@ return true; } -static bool collectUsesWithPtrTypes(Value *Val, std::vector &WorkList) { +bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, + Value *Val, + Instruction *Inst, + int OpIdx0, + int OpIdx1) const { + // Figure out which operand is the one we might not be promoting. + Value *OtherOp = Inst->getOperand(OpIdx0); + if (Val == OtherOp) + OtherOp = Inst->getOperand(OpIdx1); + + Value *OtherObj = GetUnderlyingObject(OtherOp, *DL); + if (!isa(OtherObj)) + return false; + + // TODO: We should be able to replace undefs with the right pointer type. + + // TODO: If we know the other base object is another promotable + // alloca, not necessarily this alloca, we can do this. The + // important part is both must have the same address space at + // the end. + if (OtherObj != BaseAlloca) { + DEBUG(dbgs() << "Found a binary instruction with another alloca object\n"); + return false; + } + + return true; +} + +bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( + Value *BaseAlloca, + Value *Val, + std::vector &WorkList) const { + bool Success = true; for (User *User : Val->users()) { if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) @@ -430,12 +483,42 @@ return false; } + // Only promote an icmp if we know that the other compare operand + // is a pointer derived from the same alloca being promoted. + if (ICmpInst *ICmp = dyn_cast_or_null(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) + return false; + } + if (!User->getType()->isPointerTy()) continue; + // Only promote a select if we know that the other select operand + // is from another pointer that will also be promoted. 
+ if (SelectInst *SI = dyn_cast_or_null(UseInst)) { + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) + return false; + } + + // Repeat for phis. + if (PHINode *Phi = dyn_cast_or_null(UseInst)) { + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. + switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) + return false; + break; + default: + return false; + } + } + WorkList.push_back(User); - Success &= collectUsesWithPtrTypes(User, WorkList); + Success &= collectUsesWithPtrTypes(BaseAlloca, User, WorkList); } return Success; } @@ -470,7 +553,7 @@ std::vector WorkList; - if (!collectUsesWithPtrTypes(&I, WorkList)) { + if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); return; } Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s + +; This normally would be fixed by instcombine to be compare to the GEP +; indices + +; CHECK-LABEL: @lds_promoted_alloca_icmp_same_derived_pointer( +; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %a +; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %b +; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1 +define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a + %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b + %cmp = icmp eq i32* %ptr0, %ptr1 + %zext = 
zext i1 %cmp to i32 + store volatile i32 %zext, i32 addrspace(1)* %out + ret void +} + +; CHECK-LABEL: @lds_promoted_alloca_icmp_unknown_ptr( +; CHECK: %alloca = alloca i32, i32 16, align 4 +; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a +; CHECK: %ptr1 = call i32* @get_unknown_pointer() +; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1 +define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a + %ptr1 = call i32* @get_unknown_pointer() + %cmp = icmp eq i32* %ptr0, %ptr1 + %zext = zext i1 %cmp to i32 + store volatile i32 %zext, i32 addrspace(1)* %out + ret void +} + +declare i32* @get_unknown_pointer() #0 + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -0,0 +1,170 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s + + +; CHECK-LABEL: @branch_ptr_var_same_alloca( +; CHECK: getelementptr [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}} + +; CHECK: if: +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a + +; CHECK: else: +; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b + +; CHECK: endif: +; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] +; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 +define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { +entry: + %alloca = alloca [64 x i32], align 4 + br i1 undef, label %if, label %else + +if: + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + br label %endif + +else: + %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* 
%alloca, i32 0, i32 %b + br label %endif + +endif: + %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, i32* %phi.ptr, align 4 + ret void +} + +; CHECK-LABEL: @one_phi_value( +; CHECK: getelementptr [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @alloca.1, i32 0, i32 %14 +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a + +; CHECK: br label %exit +; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ] +; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 +define void @one_phi_value(i32 %a) #0 { +entry: + %alloca = alloca [64 x i32], align 4 + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + br label %exit + +exit: + %phi.ptr = phi i32* [ %arrayidx0, %entry ] + store i32 0, i32* %phi.ptr, align 4 + ret void +} + +; CHECK-LABEL: @branch_ptr_alloca_unknown_obj( +; CHECK: %alloca = alloca [64 x i32], align 4 + +; CHECK: if: +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + +; CHECK: else: +; CHECK: %arrayidx1 = call i32* @get_unknown_pointer() + +; CHECK: endif: +; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] +; CHECK: store i32 0, i32* %phi.ptr, align 4 +define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { +entry: + %alloca = alloca [64 x i32], align 4 + br i1 undef, label %if, label %else + +if: + %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a + br label %endif + +else: + %arrayidx1 = call i32* @get_unknown_pointer() + br label %endif + +endif: + %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, i32* %phi.ptr, align 4 + ret void +} + +; kernel void ptr_induction_var_same_alloca(void) +; { +; int alloca[64]; +; int i = 0; + +; #pragma nounroll +; for (int* p = &alloca[2], *e = &alloca[48]; p != e; ++p, ++i) +; { +; *p = i; +; } +; } + +; FIXME: This should be promotable. 
We need to use +; GetUnderlyingObjects when looking at the icmp user. + +; CHECK-LABEL: @ptr_induction_var_same_alloca( +; CHECK: %alloca = alloca [64 x i32], align 4 +; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] +define void @ptr_induction_var_same_alloca() #0 { +entry: + %alloca = alloca [64 x i32], align 4 + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 + %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 48 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %p.08 = phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] + store i32 %i.09, i32* %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1 + %inc = add nuw nsw i32 %i.09, 1 + %cmp = icmp eq i32* %incdec.ptr, %arrayidx1 + br i1 %cmp, label %for.cond.cleanup, label %for.body +} + + +; extern int* get_unknown_pointer(void); + +; kernel void ptr_induction_var_alloca_unknown(void) +; { +; int alloca[64]; +; int i = 0; +; +; for (int* p = &alloca[2], *e = get_unknown_pointer(); p != e; ++p, ++i) +; { +; *p = i; +; } +; } + +; CHECK-LABEL: @ptr_induction_var_alloca_unknown( +; CHECK: %alloca = alloca [64 x i32], align 4 +; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] +; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call +define void @ptr_induction_var_alloca_unknown() #0 { +entry: + %alloca = alloca [64 x i32], align 4 + %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 + %call = tail call i32* @get_unknown_pointer() #2 + %cmp.7 = icmp eq i32* %arrayidx, %call + br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = 
%for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] + store i32 %i.09, i32* %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, i32* %p.08, i32 1 + %inc = add nuw nsw i32 %i.09, 1 + %cmp = icmp eq i32* %incdec.ptr, %call + br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body +} + +declare i32* @get_unknown_pointer() #0 + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -0,0 +1,102 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s + +; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand( +; CHECK: %alloca = alloca i32 +; CHECK: select i1 undef, i32* undef, i32* %alloca +define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { + %alloca = alloca i32, align 4 + %select = select i1 undef, i32* undef, i32* %alloca + store i32 0, i32* %select, align 4 + ret void +} + +; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers( +; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %a +; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 %b +; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 +; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 +define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a + %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b + %select = 
select i1 undef, i32* %ptr0, i32* %ptr1 + store i32 0, i32* %select, align 4 + ret void +} + +; FIXME: This should be promotable but requires knowing that both will be promoted first. + +; CHECK-LABEL: @lds_promote_alloca_select_two_allocas( +; CHECK: %alloca0 = alloca i32, i32 16, align 4 +; CHECK: %alloca1 = alloca i32, i32 16, align 4 +; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a +; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b +; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1 +define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { + %alloca0 = alloca i32, i32 16, align 4 + %alloca1 = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a + %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b + %select = select i1 undef, i32* %ptr0, i32* %ptr1 + store i32 0, i32* %select, align 4 + ret void +} + +; TODO: Maybe this should be canonicalized to select on the constant and GEP after. +; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers( +; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca.1, i32 0, i32 %{{[0-9]+}} +; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 1 +; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* [[ARRAYGEP]], i32 3 +; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 +; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 +define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 1 + %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 3 + %select = select i1 undef, i32* %ptr0, i32* %ptr1 + store i32 0, i32* %select, align 4 + ret void +} + +; CHECK-LABEL: @lds_promoted_alloca_select_input_select( +; CHECK: getelementptr [256 x i32], [256 x i32] addrspace(3)* @alloca.2, i32 0, i32 %{{[0-9]+}} 
+; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %a +; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %b +; CHECK: %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %{{[0-9]+}}, i32 %c +; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 +; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2 +; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4 +define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a + %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b + %ptr2 = getelementptr inbounds i32, i32* %alloca, i32 %c + %select0 = select i1 undef, i32* %ptr0, i32* %ptr1 + %select1 = select i1 undef, i32* %select0, i32* %ptr2 + store i32 0, i32* %select1, align 4 + ret void +} + +define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { +entry: + %alloca = alloca i32, i32 16, align 4 + %ptr0 = getelementptr inbounds i32, i32* %alloca, i32 %a + %ptr1 = getelementptr inbounds i32, i32* %alloca, i32 %b + store i32 0, i32* %ptr0 + br i1 undef, label %bb1, label %bb2 + +bb1: + %ptr2 = getelementptr inbounds i32, i32* %alloca, i32 %c + %select0 = select i1 undef, i32* undef, i32* %ptr2 + store i32 0, i32* %ptr1 + br label %bb2 + +bb2: + %phi.ptr = phi i32* [ %ptr0, %entry ], [ %select0, %bb1 ] + %select1 = select i1 undef, i32* %phi.ptr, i32* %ptr1 + store i32 0, i32* %select1, align 4 + ret void +} + +attributes #0 = { norecurse nounwind }