Index: llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -32,6 +32,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Target/TargetMachine.h"
@@ -56,8 +57,10 @@
 };
 
 // Attributes to propagate.
+// TODO: Support conservative min/max merging instead of cloning.
 static constexpr const char* AttributeNames[] = {
-  "amdgpu-waves-per-eu"
+  "amdgpu-waves-per-eu",
+  "amdgpu-flat-work-group-size"
 };
 
 static constexpr unsigned NumAttr =
@@ -371,15 +374,30 @@
 }
 
 bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
-  if (!TM || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+  if (!TM) {
+    // TM is unset: recover it from TargetPassConfig, or bail if unavailable.
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      return false;
+
+    TM = &TPC->getTM<TargetMachine>();
+  }
+
+  if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
     return false;
 
   return AMDGPUPropagateAttributes(TM, false).process(F);
 }
 
 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
-  if (!TM)
-    return false;
+  if (!TM) {
+    // TM is unset: recover it from TargetPassConfig, or bail if unavailable.
+    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+    if (!TPC)
+      return false;
+
+    TM = &TPC->getTM<TargetMachine>();
+  }
 
   return AMDGPUPropagateAttributes(TM, true).process(M);
 }
Index: llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/propagate-attributes-flat-work-group-size.ll
@@ -0,0 +1,48 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-propagate-attributes-late %s | FileCheck %s
+
+; CHECK: define internal void @max_flat_1_1024() #0 {
+define internal void @max_flat_1_1024() #0 {
+  ret void
+}
+
+; CHECK: define internal void @max_flat_1_256() #1 {
+define internal void @max_flat_1_256() #1 {
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel_1_256_call_default() #1 {
+define amdgpu_kernel void @kernel_1_256_call_default() #1 {
+  call void @default()
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
+define amdgpu_kernel void @kernel_1_256_call_1_256() #1 {
+  call void @max_flat_1_256()
+  ret void
+}
+
+; CHECK: define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
+define amdgpu_kernel void @kernel_1_256_call_64_64() #1 {
+  call void @max_flat_64_64()
+  ret void
+}
+
+; CHECK: define internal void @max_flat_64_64() #2 {
+define internal void @max_flat_64_64() #2 {
+  ret void
+}
+
+; CHECK: define internal void @default() #2 {
+define internal void @default() #3 {
+  ret void
+}
+
+attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024" }
+attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256" }
+attributes #2 = { noinline "amdgpu-flat-work-group-size"="64,64" }
+attributes #3 = { noinline }
+
+; CHECK: attributes #0 = { noinline "amdgpu-flat-work-group-size"="1,1024"
+; CHECK-NEXT: attributes #1 = { noinline "amdgpu-flat-work-group-size"="1,256"
+; CHECK-NEXT: attributes #2 = { noinline "amdgpu-flat-work-group-size"="1,256"