diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -180,6 +180,37 @@ return LDSPointer; } + // Returns the instruction at which to split the entry block of kernel K: + // as an optimization, the one just after the topmost static alloca cluster. + // + // For the sake of later transformations/optimizations, all static allocas + // are expected to appear as a single contiguous cluster at the start of the + // entry block. If this canonical form is *not* maintained, then some static + // allocas may become dynamic after the entry block is split here. + Instruction *findEntryBlockSplitPoint(Function *K) { + auto &EBB = K->getEntryBlock(); + auto Iter = EBB.getFirstInsertionPt(); + auto *EI = &*Iter; + auto *TI = EBB.getTerminator(); + + // Locate the first static alloca. + while ((&*Iter != TI) && !isa(&*Iter) && !isa(&*Iter)) + ++Iter; + + // Topmost static alloca cluster not found? Split the block at the first + // insertion point. + if (!isa(&*Iter)) + return EI; + + // At least one static alloca was found at the top of the entry block. + // Skip the whole cluster. + while (isa(&*Iter)) + ++Iter; + + // Split the block just after the topmost static alloca cluster. + return &*Iter; + } + // Split entry basic block in such a way that only lane 0 of each wave does // the LDS pointer initialization, and return newly created basic block. BasicBlock *activateLaneZero(Function *K) { @@ -189,9 +220,8 @@ if (!BasicBlockEntry.second) return BasicBlockEntry.first->second; - // Split entry basic block of kernel K. - auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); - IRBuilder<> Builder(EI); + // Split entry basic block of kernel K just after top static alloca cluster. 
+ IRBuilder<> Builder(findEntryBlockSplitPoint(K)); Value *Mbcnt = Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -295,7 +295,7 @@ static cl::opt EnableLDSReplaceWithPointer( "amdgpu-enable-lds-replace-with-pointer", - cl::desc("Enable LDS replace with pointer pass"), cl::init(false), + cl::desc("Enable LDS replace with pointer pass"), cl::init(true), cl::Hidden); static cl::opt EnableLowerModuleLDS( diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -42,6 +42,7 @@ ; GCN-O0-NEXT: Inliner for always_inline functions ; GCN-O0-NEXT: A No-Op Barrier Pass ; GCN-O0-NEXT: Lower OpenCL enqueued blocks +; GCN-O0-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Expand Atomic instructions @@ -178,6 +179,7 @@ ; GCN-O1-NEXT: Inliner for always_inline functions ; GCN-O1-NEXT: A No-Op Barrier Pass ; GCN-O1-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces @@ -429,6 +431,7 @@ ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions ; GCN-O1-OPTS-NEXT: A No-Op Barrier Pass ; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks +; GCN-O1-OPTS-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces @@ -713,6 +716,7 @@ ; GCN-O2-NEXT: 
Inliner for always_inline functions ; GCN-O2-NEXT: A No-Op Barrier Pass ; GCN-O2-NEXT: Lower OpenCL enqueued blocks +; GCN-O2-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces @@ -999,6 +1003,7 @@ ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: A No-Op Barrier Pass ; GCN-O3-NEXT: Lower OpenCL enqueued blocks +; GCN-O3-NEXT: Replace within non-kernel function use of LDS with pointer ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-diamond-shape.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-selected_functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll --- 
a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-call-to-declare-only-func.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-global-scope-use.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-inline-asm-call.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-kernel-only-used-lds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer 
-amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION ; ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-not-reachable-lds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION ; ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-ignore-small-lds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION ; ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-diamond-shape.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll 
b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-selected_functions.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-indirect-call-signature-match.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-top-static-alloca-cluster.ll copy from llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll copy to llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-top-static-alloca-cluster.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-split-entry-bb-after-top-static-alloca-cluster.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; @@ -7,6 +7,9 @@ ; reachable from kernel. 
Hence nested constant expression should to be converted into a ; series of instructons and pointer replacement should take place. ; +; Further, the entry basic block of the kernel @k0 contains alloca instructions. Hence the +; entry basic block splitting for pointer initialization should happen after the allocas. +; ; Original LDS should exist. ; CHECK: @used_only_within_func = addrspace(3) global [4 x i32] undef, align 4 @@ -36,19 +39,31 @@ ; Pointer initialization code shoud be added define amdgpu_kernel void @k0() { ; CHECK-LABEL: entry: -; CHECK: %0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK: %1 = icmp eq i32 %0, 0 -; CHECK: br i1 %1, label %2, label %3 +; CHECK: %0 = alloca i64, align 8, addrspace(5) +; CHECK: %1 = alloca i64, align 8, addrspace(5) +; CHECK: %2 = alloca i64, align 8, addrspace(5) +; CHECK: %3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK: %4 = icmp eq i32 %3, 0 +; CHECK: br i1 %4, label %5, label %6 ; -; CHECK-LABEL: 2: +; CHECK-LABEL: 5: ; CHECK: store i16 ptrtoint ([4 x i32] addrspace(3)* @used_only_within_func to i16), i16 addrspace(3)* @used_only_within_func.ptr, align 2 -; CHECK: br label %3 -; -; CHECK-LABEL: 3: +; CHECK: br label %6 + +; CHECK-LABEL: 6: ; CHECK: call void @llvm.amdgcn.wave.barrier() +; CHECK: %7 = addrspacecast i64 addrspace(5)* %0 to i64* +; CHECK: %8 = addrspacecast i64 addrspace(5)* %1 to i64* +; CHECK: %9 = addrspacecast i64 addrspace(5)* %2 to i64* ; CHECK: call void @f0(i32 0) ; CHECK: ret void entry: + %0 = alloca i64, align 8, addrspace(5) + %1 = alloca i64, align 8, addrspace(5) + %2 = alloca i64, align 8, addrspace(5) + %3 = addrspacecast i64 addrspace(5)* %0 to i64* + %4 = addrspacecast i64 addrspace(5)* %1 to i64* + %5 = addrspacecast i64 addrspace(5)* %2 to i64* call void @f0(i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll --- 
a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-multiple-lds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-same-lds.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr1.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-const-expr2.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- 
-amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ; There is one lds global defined here, and this lds is used within a single non-kernel diff --git a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll --- a/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll +++ b/llvm/test/CodeGen/AMDGPU/replace-lds-by-ptr-use-within-phi-inst.ll @@ -1,4 +1,4 @@ -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer -amdgpu-enable-lds-replace-with-pointer=true < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-replace-lds-use-with-pointer < %s | FileCheck %s ; DESCRIPTION: ;