diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -66,8 +66,11 @@
   const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphics;
+  bool IsPixelShader;
   bool HasFP32Denormals;
   bool HasFP64FP16Denormals;
+  bool UseAggressiveLoadSinking;
+  bool CanAlwaysSinkLoads;
 
   static const FeatureBitset InlineFeatureIgnoreList;
 
@@ -219,6 +222,8 @@
   InstructionCost getMinMaxReductionCost(
       VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind);
+
+  bool canAlwaysSinkRead(const Instruction *Inst, const BasicBlock *Dest) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -14,6 +14,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPUInstrInfo.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -288,10 +289,16 @@
     : BaseT(TM, F.getParent()->getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
       TLI(ST->getTargetLowering()), CommonTTI(TM, F),
-      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
+      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
+      IsPixelShader(F.getCallingConv() == CallingConv::AMDGPU_PS) {
   AMDGPU::SIModeRegisterDefaults Mode(F);
   HasFP32Denormals = Mode.allFP32Denormals();
   HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+
+  UseAggressiveLoadSinking =
+      F.getFnAttribute("amdgpu-aggressive-load-sinking").getValueAsBool();
+  CanAlwaysSinkLoads =
+      F.getFnAttribute("amdgpu-always-sink-loads").getValueAsBool();
 }
 
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
@@ -1209,3 +1216,27 @@
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+bool GCNTTIImpl::canAlwaysSinkRead(const Instruction *Inst,
+                                   const BasicBlock *Dest) const {
+  if (!UseAggressiveLoadSinking)
+    return false;
+
+  // An earlier pass may have proven there are no stores or exec-mask writes.
+  if (CanAlwaysSinkLoads)
+    return true;
+
+  bool Result = Inst->hasMetadata(LLVMContext::MD_invariant_load);
+  if (Result && IsPixelShader) {
+    // Instructions using implicit derivatives in WQM cannot be safely sunk,
+    // as kills/demotes may alter the exec mask.
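+    // Plain invariant loads remain sinkable; image intrinsics are rejected
+    // below if their MIMG base opcode executes in whole quad mode (WQM).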
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+      if (auto *Intr = AMDGPU::getImageDimIntrinsicInfo(II->getIntrinsicID())) {
+        const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+            AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+        Result = !BaseOpcode->WQM;
+      }
+    }
+  }
+  return Result;
+}
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -48,6 +48,7 @@
   bit IsAtomicRet = 0;
   bit MSAA = 0;
   bit BVH = 0;
+  bit WQM = 0;
 }
 
 def MIMGBaseOpcode : GenericEnum {
@@ -59,7 +60,7 @@
   let CppTypeName = "MIMGBaseOpcodeInfo";
   let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
                 "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
-                "LodOrClampOrMip", "HasD16", "MSAA", "BVH"];
+                "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "WQM"];
   string TypeOf_BaseOpcode = "MIMGBaseOpcode";
 
   let PrimaryKey = ["BaseOpcode"];
@@ -1086,6 +1087,7 @@
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = !not(isGetLod);
     let G16 = isG16;
+    let WQM = wqm;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -1111,6 +1113,7 @@
   def "" : MIMG_Sampler_BaseOpcode<sample> {
     let HasD16 = 1;
     let Gather4 = 1;
+    let WQM = wqm;
   }
 
   let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -313,6 +313,7 @@
   bool HasD16;
   bool MSAA;
   bool BVH;
+  bool WQM;
 };
 
 LLVM_READONLY
diff --git a/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll b/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/aggressive-load-sinking.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1010 -sink < %s | FileCheck --check-prefix=CHECK %s
+
+; Can sink invariant load
+define amdgpu_ps <4 x float> @invariant_load(<4 x float> addrspace(4)* %in, i32 %s) #0 {
+; CHECK-LABEL: @invariant_load(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[S:%.*]], 0
+; CHECK-NEXT:    br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       block:
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[S]], 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[V:%.*]] = load <4 x float>, <4 x float> addrspace(4)* [[IN:%.*]], align 4, !invariant.load !0
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+main_body:
+  %v = load <4 x float>, <4 x float> addrspace(4)* %in, align 4, !invariant.load !0
+  %c = icmp eq i32 %s, 0
+  br i1 %c, label %block, label %end
+block:
+  %z = add i32 %s, 1
+  br label %end
+end:
+  ret <4 x float> %v
+}
+
+; Cannot sink load not marked as invariant
+define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) #0 {
+; CHECK-LABEL: @load_1d(
+; CHECK-NEXT:  main_body:
+; CHECK-NEXT:    [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[S]], 0
+; CHECK-NEXT:    br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]]
+; CHECK:       block:
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[S]], 1
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret <4 x float> [[V]]
+;
+main_body:
+  %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  %c = icmp eq i32 %s, 0
+  br i1 %c, label %block, label %end
+block:
+  %z = add i32 %s, 1
+ br label %end +end: + ret <4 x float> %v +} + +; Can sink invariant load +define amdgpu_ps <4 x float> @load_1d_invariant(<8 x i32> inreg %rsrc, i32 %s) #0 { +; CHECK-LABEL: @load_1d_invariant( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0), !invariant.load !0 + %c = icmp eq i32 %s, 0 + br i1 %c, label %block, label %end +block: + %z = add i32 %s, 1 + br label %end +end: + ret <4 x float> %v +} + +; Can sink any load with attribute set +define amdgpu_ps <4 x float> @load_1d_forced(<8 x i32> inreg %rsrc, i32 %s) #1 { +; CHECK-LABEL: @load_1d_forced( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[S:%.*]], 0 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = add i32 [[S]], 1 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0) +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + %c = icmp eq i32 %s, 0 + br i1 %c, label %block, label %end +block: + %z = add i32 %s, 1 + br label %end +end: + ret <4 x float> %v +} + +; Cannot sink sample with implicit derivatives +define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 { +; CHECK-LABEL: @sample_2d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink sample explicit derivatives +define amdgpu_ps <4 x float> @sample_2d_d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t) #0 { +; CHECK-LABEL: @sample_2d_d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float [[DSDH:%.*]], float [[DTDH:%.*]], float [[DSDV:%.*]], float [[DTDV:%.*]], float [[S]], float [[T:%.*]], <8 x i32> 
[[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32 15, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink sample with explicit LOD +define amdgpu_ps <4 x float> @sample_2d_l(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) #0 { +; CHECK-LABEL: @sample_2d_l( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float [[S]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Cannot sink gather4 with implicit derivatives +define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) #0 { +; CHECK-LABEL: @gather4_2d( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float [[S:%.*]], float [[T:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +; Can sink gather4 with explicit LOD +define amdgpu_ps <4 x float> @gather4_2d_l_o(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) #0 { +; CHECK-LABEL: @gather4_2d_l_o( +; CHECK-NEXT: main_body: +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[S:%.*]], 0.000000e+00 +; CHECK-NEXT: br i1 [[C]], label [[BLOCK:%.*]], label [[END:%.*]] +; CHECK: block: +; CHECK-NEXT: [[Z:%.*]] = fadd float [[S]], 1.000000e+00 +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[V:%.*]] = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 [[OFFSET:%.*]], float [[S]], float [[T:%.*]], float [[LOD:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0), !invariant.load !0 +; CHECK-NEXT: ret <4 x float> [[V]] +; +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float 
%t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0), !invariant.load !0 + %c = fcmp oeq float %s, 0.0 + br i1 %c, label %block, label %end +block: + %z = fadd float %s, 1.0 + br label %end +end: + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 +declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #2 + +attributes #0 = { nounwind "amdgpu-aggressive-load-sinking"="true" } +attributes #1 = { nounwind "amdgpu-aggressive-load-sinking"="true" "amdgpu-always-sink-loads"="true" } +attributes #2 = { nounwind readonly } + +!0 = !{}