Index: llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -128,6 +128,17 @@
     return ST.hasApertureRegs();
   }
 
+  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return ST.getFlatWorkGroupSizes(F);
+  }
+
+  std::pair<unsigned, unsigned>
+  getMaximumFlatWorkGroupRange(const Function &F) {
+    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
+  }
+
 private:
   /// Check if the ConstantExpr \p CE requires queue ptr attribute.
   static bool visitConstExpr(const ConstantExpr *CE) {
@@ -470,6 +481,118 @@
   llvm_unreachable("AAAMDAttributes is only valid for function position");
 }
 
+/// Propagate amdgpu-flat-work-group-size attribute.
+struct AAAMDFlatWorkGroupSize
+    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
+  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
+  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
+      : Base(IRP, 32) {}
+
+  /// See AbstractAttribute::getState(...).
+  IntegerRangeState &getState() override { return *this; }
+  const IntegerRangeState &getState() const override { return *this; }
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+    unsigned MinGroupSize, MaxGroupSize;
+    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
+    intersectKnown(
+        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+    auto CheckCallSite = [&](AbstractCallSite CS) {
+      Function *Caller = CS.getInstruction()->getFunction();
+      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
+                        << "->" << getAssociatedFunction()->getName() << '\n');
+
+      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
+          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+
+      Change |=
+          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
+
+      return true;
+    };
+
+    bool AllCallSitesKnown = true;
+    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+      indicatePessimisticFixpoint();
+
+    return Change;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    SmallVector<Attribute, 8> AttrList;
+    Function *F = getAssociatedFunction();
+    LLVMContext &Ctx = F->getContext();
+
+    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+    unsigned Min, Max;
+    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
+
+    // Don't add the attribute if it's the implied default.
+    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
+      return ChangeStatus::UNCHANGED;
+
+    SmallString<10> Buffer;
+    raw_svector_ostream OS(Buffer);
+    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+
+    AttrList.push_back(
+        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
+    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
+                                              /* ForceReplace */ true);
+  }
+
+  const std::string getAsStr() const override {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    OS << "AMDFlatWorkGroupSize[";
+    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
+    OS << ']';
+    return OS.str();
+  }
+
+  /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
+                                                   Attributor &A);
+
+  /// See AbstractAttribute::getName()
+  const std::string getName() const override {
+    return "AAAMDFlatWorkGroupSize";
+  }
+
+  /// See AbstractAttribute::getIdAddr()
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDFlatWorkGroupSize
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+const char AAAMDFlatWorkGroupSize::ID = 0;
+
+AAAMDFlatWorkGroupSize &
+AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
+                                          Attributor &A) {
+  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
+  llvm_unreachable(
+      "AAAMDFlatWorkGroupSize is only valid for function position");
+}
+
 class AMDGPUAttributor : public ModulePass {
 public:
   AMDGPUAttributor() : ModulePass(ID) {}
@@ -497,7 +620,8 @@
   BumpPtrAllocator Allocator;
   AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
   DenseSet<const char *> Allowed(
-      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID, &AACallEdges::ID});
+      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+       &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID});
 
   Attributor A(Functions, InfoCache, CGUpdater, &Allowed);
 
@@ -505,6 +629,9 @@
     if (!F.isIntrinsic()) {
       A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+      if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
+        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+      }
     }
   }
 
Index: llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -55,10 +55,7 @@
 
 // Attributes to propagate.
 // TODO: Support conservative min/max merging instead of cloning.
-static constexpr const char* AttributeNames[] = {
-  "amdgpu-waves-per-eu",
-  "amdgpu-flat-work-group-size"
-};
+static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
 
 static constexpr unsigned NumAttr =
     sizeof(AttributeNames) / sizeof(AttributeNames[0]);
Index: llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-propagate-attributes-late %s | FileCheck %s
-
-; CHECK: define internal void @max_flat_1_1024() #0 {
-define internal void @max_flat_1_1024() #0 {
-  ret void
-}
-
-; CHECK: define internal void @max_flat_1_256() #1 {
-define internal void @max_flat_1_256() #1 {
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
-define amdgpu_kernel void @kernel_1_256_call_default() #1 {
-  call void @default()
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
-define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
-  call void @max_flat_1_256()
-  ret void
-}
-
-; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
-define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
-  call void @max_flat_64_64()
-  ret void
-}
-
-; CHECK: define internal void @max_flat_64_64() #2 {
-define internal void @max_flat_64_64() #2 {
-  ret void
-}
-
-; CHECK: define internal void @default() #2 {
-define internal void @default() #3 {
-  ret void
-}
-
-attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
-attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
-attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
-attributes #3 = { noinline }
-
-; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
-; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
-; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"
Index: llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+
+; Check propagation of amdgpu-flat-work-group-size attribute.
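+;
+; The expected merge rule (a sketch inferred from the checks below, not a
+; separate spec): a callee's assumed range becomes the union of all of its
+; callers' ranges, intersected with the callee's own declared range. For
+; example, a function called only from kernels annotated "64,128" and
+; "128,256" should be manifested with "amdgpu-flat-work-group-size"="64,256".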
+
+; Called from a single kernel with 1,256
+define internal void @default_to_1_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_1_256
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_1_256() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_1_256
+; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-NEXT:    call void @default_to_1_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_1_256()
+  ret void
+}
+
+; Called from a single kernel with 64,128
+define internal void @default_to_64_128() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_128
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_64_128() #1 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_128
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @default_to_64_128()
+; CHECK-NEXT:    call void @flat_group_64_64()
+; CHECK-NEXT:    call void @default_to_64_256()
+; CHECK-NEXT:    call void @flat_group_128_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_64_128()
+  call void @flat_group_64_64()
+  call void @default_to_64_256()
+  call void @flat_group_128_256()
+  ret void
+}
+
+; Called from kernels with 128,512 and 512,512
+define internal void @default_to_128_512() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_128_512
+; CHECK-SAME: () #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; This already has strict bounds, but is called from kernels with wider
+; bounds, and should not be changed.
+define internal void @flat_group_64_64() #2 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_64_64
+; CHECK-SAME: () #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 128,256 -> 128,128
+define internal void @flat_group_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_128_256
+; CHECK-SAME: () #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define internal void @flat_group_512_1024() #4 {
+; CHECK-LABEL: define {{[^@]+}}@flat_group_512_1024
+; CHECK-SAME: () #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_128_512() #5 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_512
+; CHECK-SAME: () #[[ATTR2]] {
+; CHECK-NEXT:    call void @default_to_128_512()
+; CHECK-NEXT:    call void @flat_group_64_64()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_128_512()
+  call void @flat_group_64_64()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_512_512() #6 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_512_512
+; CHECK-SAME: () #[[ATTR5]] {
+; CHECK-NEXT:    call void @default_to_128_512()
+; CHECK-NEXT:    call void @flat_group_512_1024()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_128_512()
+  call void @flat_group_512_1024()
+  ret void
+}
+
+; Called from kernels with 128,256 and 64,128 => 64,256
+define internal void @default_to_64_256() {
+; CHECK-LABEL: define {{[^@]+}}@default_to_64_256
+; CHECK-SAME: () #[[ATTR6:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; The kernel's lower bound is higher than the callee's lower bound, so
+; this should probably be illegal.
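+; (Concretely: this kernel is annotated "128,256", while @default_to_64_256
+; merges to "64,256" from its two callers, so the callee's assumed lower
+; bound of 64 sits below the kernel's minimum of 128.)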
+define amdgpu_kernel void @kernel_128_256() #3 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_128_256
+; CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+; CHECK-NEXT:    call void @default_to_64_256()
+; CHECK-NEXT:    ret void
+;
+  call void @default_to_64_256()
+  ret void
+}
+
+; 64,128 -> 64,128
+define internal void @merge_cycle_0() #1 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_0
+; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-NEXT:    call void @merge_cycle_1()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_1()
+  ret void
+}
+
+; 128,256 -> 128,128
+define internal void @merge_cycle_1() #3 {
+; CHECK-LABEL: define {{[^@]+}}@merge_cycle_1
+; CHECK-SAME: () #[[ATTR4]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_64_256() #7 {
+; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT:    call void @merge_cycle_0()
+; CHECK-NEXT:    call void @default_captured_address()
+; CHECK-NEXT:    call void @externally_visible_default()
+; CHECK-NEXT:    [[F32:%.*]] = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+; CHECK-NEXT:    ret void
+;
+  call void @merge_cycle_0()
+  call void @default_captured_address()
+  call void @externally_visible_default()
+  %f32 = call float bitcast (i32 ()* @bitcasted_function to float ()*)()
+  ret void
+}
+
+define internal void @default_captured_address() {
+; CHECK-LABEL: define {{[^@]+}}@default_captured_address
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
+; CHECK-NEXT:    store volatile void ()* @default_captured_address, void ()** undef, align 8
+; CHECK-NEXT:    ret void
+;
+  store volatile void ()* @default_captured_address, void ()** undef, align 8
+  ret void
+}
+
+define void @externally_visible_default() {
+; CHECK-LABEL: define {{[^@]+}}@externally_visible_default
+; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; 1,1024 -> 64,256
+define internal i32 @bitcasted_function() {
+; CHECK-LABEL: define {{[^@]+}}@bitcasted_function
+; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-NEXT:    ret i32 0
+;
+  ret i32 0
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { "amdgpu-flat-work-group-size"="64,128" }
+attributes #2 = { "amdgpu-flat-work-group-size"="64,64" }
+attributes #3 = { "amdgpu-flat-work-group-size"="128,256" }
+attributes #4 = { "amdgpu-flat-work-group-size"="512,1024" }
+attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+;.