Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,7 +35,8 @@
   MDNode *MaxWorkGroupSizeRange;
 
   // FIXME: This should be per-kernel.
-  int LocalMemAvailable;
+  uint32_t LocalMemLimit;
+  uint32_t CurrentLocalMemUsage;
 
   bool IsAMDGCN;
   bool IsAMDHSA;
@@ -51,7 +52,8 @@
     TM(TM_),
     Mod(nullptr),
     MaxWorkGroupSizeRange(nullptr),
-    LocalMemAvailable(0),
+    LocalMemLimit(0),
+    CurrentLocalMemUsage(0),
     IsAMDGCN(false),
     IsAMDHSA(false) { }
@@ -108,38 +110,86 @@
   for (Type *ParamTy : FTy->params()) {
     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-      LocalMemAvailable = 0;
-      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
+      LocalMemLimit = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                       "local memory disabled.\n");
       return false;
     }
   }
 
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
-  LocalMemAvailable = ST.getLocalMemorySize();
-  if (LocalMemAvailable == 0)
+
+  LocalMemLimit = ST.getLocalMemorySize();
+  if (LocalMemLimit == 0)
     return false;
 
+  const DataLayout &DL = Mod->getDataLayout();
+
   // Check how much local memory is being used by global objects
+  CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
       continue;
 
-    for (User *U : GV.users()) {
-      Instruction *Use = dyn_cast<Instruction>(U);
+    for (const User *U : GV.users()) {
+      const Instruction *Use = dyn_cast<Instruction>(U);
       if (!Use)
         continue;
 
       if (Use->getParent()->getParent() == &F) {
-        LocalMemAvailable -=
-          Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
+        unsigned Align = GV.getAlignment();
+        if (Align == 0)
+          Align = DL.getABITypeAlignment(GV.getValueType());
+
+        // FIXME: Try to account for padding here. The padding is currently
+        // determined from the inverse order of uses in the function. I'm not
+        // sure if the use list order is in any way connected to this, so the
+        // total reported size is likely incorrect.
+        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+        CurrentLocalMemUsage += AllocSize;
         break;
       }
     }
   }
 
-  LocalMemAvailable = std::max(0, LocalMemAvailable);
-  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+
+  // Restrict local memory usage so that we don't drastically reduce occupancy,
+  // unless it is already significantly reduced.
+
+  // TODO: Have some sort of hint or other heuristics to guess occupancy based
+  // on other factors.
+  unsigned OccupancyHint
+    = AMDGPU::getIntegerAttribute(F, "amdgpu-expected-occupancy", 0);
+  if (OccupancyHint == 0)
+    OccupancyHint = 7;
+
+  // Clamp to max value.
+  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
+
+  // Check the hint but ignore it if it's obviously wrong from the existing LDS
+  // usage.
+  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
+
+  // Round up to the next tier of usage.
+  unsigned MaxSizeWithWaveCount
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+
+  // Program is possibly broken by using more local mem than available.
+  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
+    return false;
+
+  LocalMemLimit = MaxSizeWithWaveCount;
+
+  DEBUG(
+    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
+           << "  Rounding size to " << MaxSizeWithWaveCount
+           << " with a maximum occupancy of " << MaxOccupancy << '\n'
+           << " and " << (LocalMemLimit - CurrentLocalMemUsage)
+           << " available for promotion\n"
  );
 
   BasicBlock &EntryBB = *F.begin();
   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
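The net effect of the block above is easiest to see with concrete numbers. Below is a minimal standalone sketch of the same derivation, with the subtarget hooks (getOccupancyWithLocalMemSize, getMaxLocalMemSizeWithWaveCount, getMaxWavesPerCU) replaced by the SI values added later in this patch, and using the 1060 bytes of LDS globals from the new promote-alloca-padding-size-estimate.ll test; it is an illustration of the arithmetic, not pass code:

  #include <algorithm>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t CurrentLocalMemUsage = 1060; // LDS bytes already used by globals
    unsigned MaxOccupancy = 10;           // getOccupancyWithLocalMemSize(1060): 1060 <= 1638
    unsigned OccupancyHint = 0;           // no "amdgpu-expected-occupancy" attribute
    if (OccupancyHint == 0)
      OccupancyHint = 7;                  // default occupancy guess
    OccupancyHint = std::min(OccupancyHint, 10u);         // clamp to getMaxWavesPerCU()
    MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); // 7
    uint32_t LocalMemLimit = 2340;        // getMaxLocalMemSizeWithWaveCount(7)
    printf("%u bytes left for promotion\n", LocalMemLimit - CurrentLocalMemUsage); // 1280
    return 0;
  }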
@@ -482,6 +532,7 @@
   return true;
 }
 
+// FIXME: Should try to pick the most likely to be profitable allocas first.
 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
   // Array allocations are probably not worth handling, since an allocation of
   // the array type is the canonical form.
@@ -495,10 +546,10 @@
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I))
+  if (tryPromoteAllocaToVector(&I)) {
+    DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
     return;
-
-  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+  }
 
   const Function &ContainingFunction = *I.getParent()->getParent();
 
@@ -506,14 +557,30 @@
   // function attribute if it is available.
   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
 
-  int AllocaSize =
-    WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
+  const DataLayout &DL = Mod->getDataLayout();
+
+  unsigned Align = I.getAlignment();
+  if (Align == 0)
+    Align = DL.getABITypeAlignment(I.getAllocatedType());
 
-  if (AllocaSize > LocalMemAvailable) {
-    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+  // FIXME: This computed padding is likely wrong since it depends on inverse
+  // usage order.
+  //
+  // FIXME: It is also possible that if we're allowed to use all of the memory
+  // we could end up using more than the maximum due to alignment padding.
+
+  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+  NewSize += AllocSize;
+
+  if (NewSize > LocalMemLimit) {
+    DEBUG(dbgs() << " " << AllocSize
+          << " bytes of local memory not available to promote\n");
     return;
   }
 
+  CurrentLocalMemUsage = NewSize;
+
   std::vector<Value*> WorkList;
 
   if (!collectUsesWithPtrTypes(&I, WorkList)) {
@@ -522,7 +589,6 @@
   }
 
   DEBUG(dbgs() << "Promoting alloca to local memory\n");
-  LocalMemAvailable -= AllocaSize;
 
   Function *F = I.getParent()->getParent();
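For a worked instance of the new per-alloca check, take @occupancy_0 from the updated large-work-group-promote-alloca.ll below: it has no "amdgpu-max-work-group-size" attribute, so getMaximumWorkGroupSize falls back to its 256-work-item default, and the "0" occupancy hint falls back to the default guess of 7 (a 2340-byte limit). A sketch of the arithmetic, with the pass's state hardcoded for that test:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t CurrentLocalMemUsage = 0;       // the test has no LDS globals
    uint32_t LocalMemLimit = 2340;           // getMaxLocalMemSizeWithWaveCount(7)
    unsigned WorkGroupSize = 256;            // default maximum work group size
    uint32_t AllocSize = WorkGroupSize * 20; // [5 x i32] is 20 bytes -> 5120
    uint32_t NewSize = CurrentLocalMemUsage + AllocSize; // alignTo(0, 4) == 0
    printf("%s\n", NewSize > LocalMemLimit ? "kept as alloca" : "promoted");
    return 0;
  }

5120 > 2340, so that test expects the alloca to survive.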
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -269,6 +269,15 @@
     return CFALUBug;
   }
 
+  /// Return the amount of LDS that can be used that will not restrict the
+  /// occupancy lower than WaveCount.
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+
+  /// Inverse of getMaxLocalMemSizeWithWaveCount. Return the maximum wavecount
+  /// if the given LDS memory size is the only constraint.
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+
   int getLocalMemorySize() const {
     return LocalMemorySize;
   }
@@ -334,7 +343,7 @@
       return 10;
 
     // FIXME: Not sure what this is for other subtargets.
-    llvm_unreachable("do not know max waves per CU for this subtarget.");
+    return 8;
   }
 
   bool enableSubRegLiveness() const override {
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -154,6 +154,64 @@
   }
 }
 
+// FIXME: These limits are for SI. Did they change with the larger maximum LDS
+// size?
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
+  switch (NWaves) {
+  case 10:
+    return 1638;
+  case 9:
+    return 1820;
+  case 8:
+    return 2048;
+  case 7:
+    return 2340;
+  case 6:
+    return 2730;
+  case 5:
+    return 3276;
+  case 4:
+    return 4096;
+  case 3:
+    return 5461;
+  case 2:
+    return 8192;
+  default:
+    return getLocalMemorySize();
+  }
+}
+
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
+  if (Bytes <= 1638)
+    return 10;
+
+  if (Bytes <= 1820)
+    return 9;
+
+  if (Bytes <= 2048)
+    return 8;
+
+  if (Bytes <= 2340)
+    return 7;
+
+  if (Bytes <= 2730)
+    return 6;
+
+  if (Bytes <= 3276)
+    return 5;
+
+  if (Bytes <= 4096)
+    return 4;
+
+  if (Bytes <= 5461)
+    return 3;
+
+  if (Bytes <= 8192)
+    return 2;
+
+  return 1;
+}
+
 unsigned AMDGPUSubtarget::getAmdKernelCodeChipID() const {
   switch(getGeneration()) {
   default: llvm_unreachable("ChipID unknown");
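One observation on the two tables (nothing the patch relies on): for the 32768-byte SI LDS size, every tier equals getLocalMemorySize() / (2 * NWaves) with integer division, so the two functions round-trip consistently. A quick standalone check:

  #include <cassert>

  static unsigned tier(unsigned NWaves) { return 32768 / (2 * NWaves); }

  int main() {
    assert(tier(10) == 1638 && tier(9) == 1820 && tier(8) == 2048);
    assert(tier(7) == 2340 && tier(6) == 2730 && tier(5) == 3276);
    assert(tier(4) == 4096 && tier(3) == 5461 && tier(2) == 8192);
    return 0;
  }

Whether the same derivation holds for subtargets with a larger maximum LDS size is exactly the question raised by the FIXME above.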
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,9 @@
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
 
+unsigned getIntegerAttribute(const Function &F, StringRef Name,
+                             unsigned Default);
+
 unsigned getMaximumWorkGroupSize(const Function &F);
 unsigned getInitialPSInputAddr(const Function &F);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -109,8 +109,8 @@
   return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }
 
-static unsigned getIntegerAttribute(const Function &F, const char *Name,
-                                    unsigned Default) {
+unsigned getIntegerAttribute(const Function &F, StringRef Name,
+                             unsigned Default) {
   Attribute A = F.getFnAttribute(Name);
   unsigned Result = Default;
Index: test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
===================================================================
--- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -47,6 +47,6 @@
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-expected-occupancy"="1" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind convergent }
Index: test/CodeGen/AMDGPU/indirect-private-64.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-private-64.ll
+++ test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -3,7 +3,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
-declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.s.barrier() #0
 
 ; SI-LABEL: {{^}}private_access_f64_alloca:
@@ -81,5 +81,5 @@
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind convergent }
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-expected-occupancy"="2" "amdgpu-max-work-group-size"="64" }
Index: test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
===================================================================
--- test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -66,7 +66,52 @@
   ret void
 }
 
-attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
-attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
-attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }
+; CHECK: @occupancy_0(
+; CHECK: alloca [5 x i32]
+define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+; CHECK: @occupancy_max(
+; CHECK: alloca [5 x i32]
+define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %2 = load i32, i32* %arrayidx10, align 4
+  store i32 %2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %3, i32 addrspace(1)* %arrayidx13
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
+attributes #1 = { nounwind "amdgpu-expected-occupancy"="3" "amdgpu-max-work-group-size"="256" }
+attributes #2 = { nounwind "amdgpu-expected-occupancy"="1" "amdgpu-max-work-group-size"="1600" }
+attributes #3 = { nounwind "amdgpu-expected-occupancy"="0" }
+attributes #4 = { nounwind "amdgpu-expected-occupancy"="-1" }
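A note on the two functions just added to large-work-group-promote-alloca.ll: they pin down what happens with out-of-range hints. An "amdgpu-expected-occupancy" of "0" is indistinguishable from a missing attribute and falls back to the default guess of 7 (a 2340-byte limit). For "-1" I would expect one of two outcomes depending on how getIntegerAttribute parses it into an unsigned: a parse failure that falls back to the same default, or a wrapped-around huge value that the std::min against getMaxWavesPerCU() clamps to 10 (a 1638-byte limit). Either way, with the default 256-work-item group size the [5 x i32] alloca costs 5120 bytes, more than any reachable tier, so both CHECK lines expect the alloca to remain.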
Index: test/CodeGen/AMDGPU/private-memory-r600.ll
===================================================================
--- test/CodeGen/AMDGPU/private-memory-r600.ll
+++ test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -16,7 +16,7 @@
 ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
 
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -47,7 +47,7 @@
 ; R600-NOT: MOVA_INT
 
 %struct.point = type { i32, i32 }
-define void @multiple_structs(i32 addrspace(1)* %out) {
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -75,7 +75,7 @@
 ; FUNC-LABEL: {{^}}direct_loop:
 ; R600-NOT: MOVA_INT
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -110,7 +110,7 @@
 ; FUNC-LABEL: {{^}}short_array:
 ; R600: MOVA_INT
 
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -127,7 +127,7 @@
 ; FUNC-LABEL: {{^}}char_array:
 ; R600: MOVA_INT
 
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -148,7 +148,7 @@
 ; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * TO.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -169,7 +169,7 @@
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -193,7 +193,7 @@
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -207,7 +207,7 @@
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -220,7 +220,7 @@
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -235,7 +235,7 @@
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -248,7 +248,7 @@
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -282,7 +282,7 @@
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -296,3 +296,5 @@
 }
 
 ; OPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-expected-occupancy"="2" }
Index: test/CodeGen/AMDGPU/private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/private-memory.ll
+++ test/CodeGen/AMDGPU/private-memory.ll
@@ -79,7 +79,7 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -101,7 +101,7 @@
 ; OPT-LABEL: @high_alignment(
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [8 x i32], align 16
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -126,7 +126,7 @@
 ; OPT: alloca [5 x i32]
 ; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -161,7 +161,7 @@
 ; SI-NOT: v_movrel
 
 %struct.point = type { i32, i32 }
-define void @multiple_structs(i32 addrspace(1)* %out) {
+define void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -190,7 +190,7 @@
 ; R600-NOT: MOVA_INT
 ; SI-NOT: v_movrel
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -229,7 +229,7 @@
 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x68,0xe0
 ; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:2 ; encoding: [0x02,0x10,0x68,0xe0
 ; SI-PROMOTE: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
-define void @short_array(i32 addrspace(1)* %out, i32 %index) {
+define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -249,7 +249,7 @@
 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x60,0xe0
 ; SI-DAG: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:1 ; encoding: [0x01,0x10,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -272,7 +272,7 @@
 ; R600-NOT: MOV * TO.X
 
 ; SI-NOT: v_mov_b32_e{{(32|64)}} v0
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -294,7 +294,7 @@
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
 ; SI: v_mov_b32_e32 v3
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
+define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -318,7 +318,7 @@
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -332,7 +332,7 @@
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -345,7 +345,7 @@
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) {
+define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -360,7 +360,7 @@
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -373,7 +373,7 @@
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) {
+define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -407,7 +407,7 @@
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -424,3 +424,5 @@
 
 ; HSAOPT: !1 = !{i32 0, i32 2048}
 ; NOHSAOPT: !0 = !{i32 0, i32 2048}
+
+attributes #0 = { nounwind "amdgpu-expected-occupancy"="2" }
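The "amdgpu-expected-occupancy"="2" added to the two private-memory tests (and to indirect-private-64.ll above) is what keeps their promotion checks passing under the new accounting: with the default 256-work-item group size, even a [5 x i32] alloca costs 256 * 20 = 5120 bytes, over the 2340-byte limit implied by the default occupancy guess of 7 but under the 8192-byte tier for an occupancy of 2. @high_alignment's [8 x i32] at 32 bytes per work item comes to exactly 8192, so it still fits as well.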
Index: test/CodeGen/AMDGPU/promote-alloca-globals.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=ASM %s
 
-@global_array = internal unnamed_addr addrspace(3) global [1500 x [10 x i32]] undef, align 4
+@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
+@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
 ; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
@@ -26,7 +27,9 @@
   %tmp3 = load i32, i32* %arrayidx12
   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-  %v = getelementptr inbounds [1500 x [10 x i32]], [1500 x [10 x i32]] addrspace(3)* @global_array, i32 0, i32 0, i32 0
-  store i32 %tmp3, i32 addrspace(3)* %v
+  %v0 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array0, i32 0, i32 0, i32 0
+  store i32 %tmp3, i32 addrspace(3)* %v0
+  %v1 = getelementptr inbounds [750 x [10 x i32]], [750 x [10 x i32]] addrspace(3)* @global_array1, i32 0, i32 0, i32 0
+  store i32 %tmp3, i32 addrspace(3)* %v1
   ret void
 }
Index: test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -11,11 +11,11 @@
 declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
 
 ; CHECK-LABEL: @promote_with_memcpy(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
 define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -25,11 +25,11 @@
 }
 
 ; CHECK-LABEL: @promote_with_memmove(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
 define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -39,10 +39,10 @@
 }
 
 ; CHECK-LABEL: @promote_with_memset(
-; CHECK: getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
 define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
   %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
@@ -51,15 +51,15 @@
 }
 
 ; CHECK-LABEL: @promote_with_objectsize(
-; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [256 x [17 x i32]], [256 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
 define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
-  %alloca = alloca [17 x i32], align 16
+  %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
   store i32 %size, i32 addrspace(1)* %out
   ret void
 }
 
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-expected-occupancy"="3" }
 attributes #1 = { nounwind readnone }
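The numbers in the updated mem-intrinsics test follow the same arithmetic: with "amdgpu-max-work-group-size"="64" and an expected occupancy of 3 (a 5461-byte tier), each [17 x i32] alloca costs 64 * 68 = 4352 bytes, which fits, so the CHECK lines now expect the promoted type to be [64 x [17 x i32]] rather than the old [256 x [17 x i32]].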
Index: test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -34,5 +34,5 @@
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind optnone noinline }
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
+attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }
Index: test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -0,0 +1,130 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+
+; This shows that the LDS size estimate is sensitive to the order of the LDS
+; globals.
+
+; Both of these functions use the same amount of LDS, but the total
+; changes depending on the visit order of first use.
+
+; The one with the suboptimal order, resulting in extra padding, exceeds
+; the desired limit.
+
+; The padding estimate heuristic used by the promote alloca pass
+; is mostly determined by the order of the globals.
+
+; Raw usage = 1060 bytes
+; Rounded usage:
+; 292 + (4 pad) + 256 + (8 pad) + 512 = 1072
+; 512 + (0 pad) + 256 + (0 pad) + 292 = 1060
+
+; At the default occupancy guess of 7, 2340 bytes are available in total.
+
+; 1280 bytes need to be left free to promote the alloca ([5 x i32] = 20 bytes
+; x 64 work items); optimally packed, the globals plus the alloca require
+; exactly the available 2340 bytes (1060 + 1280).
+
+@lds0 = internal unnamed_addr addrspace(3) global [32 x <4 x i32>] undef, align 16
+@lds2 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
+@lds1 = internal unnamed_addr addrspace(3) global [73 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_0:
+; GCN: workgroup_group_segment_byte_size = 2340
+define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}promote_alloca_size_order_1:
+; GCN: workgroup_group_segment_byte_size = 2352
+define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+
+  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
+  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+
+  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+
+  ret void
+}
+
+@lds3 = internal unnamed_addr addrspace(3) global [13 x i32] undef, align 4
+@lds4 = internal unnamed_addr addrspace(3) global [63 x <4 x i32>] undef, align 16
+
+; The guess from the alignment padding pushes this over the determined
+; size limit, so it isn't promoted.
+
+; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
+; GCN: workgroup_group_segment_byte_size = 1060
+define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+entry:
+  %stack = alloca [5 x i32], align 4
+  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp0
+  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
+  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp1
+  store i32 5, i32* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
+  %tmp2 = load i32, i32* %arrayidx10, align 4
+  store i32 %tmp2, i32 addrspace(1)* %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
+  %tmp3 = load i32, i32* %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
+  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+
+  %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
+  store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
+
+  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+
+  ret void
+}
+
+attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
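To spell out the arithmetic behind the three expected segment sizes: the estimate in runOnFunction walks the module's globals in declaration order, so for the first two kernels it sees @lds0, @lds2, @lds1 and computes 512 + 256 + 292 = 1060 bytes with no padding; 1060 + 1280 = 2340 fits the limit exactly and both allocas are promoted. The final layout, though, appears to follow the inverse use order the FIXME describes, which packs cleanly for promote_alloca_size_order_0 (2340) but inserts the 4 + 8 bytes of alignment padding shown in the header comment for promote_alloca_size_order_1 (2352). For the last kernel the declaration order is the unlucky one: @lds3 is counted first, @lds4's 16-byte alignment rounds 52 up to 64, and the resulting 64 + 1008 = 1072-byte estimate plus the 1280-byte alloca is 2352 > 2340, so promotion is rejected; the actual layout packs to 1008 + 52 = 1060 bytes, which is what the last GCN check expects.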