diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4445,6 +4445,30 @@
     }
   }
 
+  // Optimize _bias away when 'bias' is zero (isZero() also matches -0.0).
+  if (const AMDGPU::MIMGBiasMappingInfo *BiasMappingInfo =
+          AMDGPU::getMIMGBiasMappingInfo(Intr->BaseOpcode)) {
+    const ConstantFP *ConstantBias;
+
+    if (mi_match(MI.getOperand(ArgOffset + Intr->BiasIndex).getReg(), *MRI,
+                 m_GFCst(ConstantBias))) {
+      if (ConstantBias->isZero()) {
+        // Switch to the variant without bias, and change the intrinsic ID.
+        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
+                                                     Intr->Dim);
+
+        // The starting indexes should remain in the same place.
+        --CorrectedNumVAddrs;
+
+        MI.getOperand(MI.getNumExplicitDefs())
+            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
+        MI.RemoveOperand(ArgOffset + Intr->BiasIndex);
+        Intr = NewImageDimIntr;
+      }
+    }
+  }
+
   // Rewrite the addressing register layout before doing anything else.
   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
     // 16 bit gradients are supported, but are tied to the A16 control
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -131,6 +131,22 @@
   let PrimaryKeyName = "getMIMGMIPMappingInfo";
 }
 
+class MIMGBiasMapping<MIMGBaseOpcode bias, MIMGBaseOpcode nobias> {
+  MIMGBaseOpcode Bias = bias;
+  MIMGBaseOpcode NoBias = nobias;
+}
+
+def MIMGBiasMappingTable : GenericTable {
+  let FilterClass = "MIMGBiasMapping";
+  let CppTypeName = "MIMGBiasMappingInfo";
+  let Fields = ["Bias", "NoBias"];
+  string TypeOf_Bias = "MIMGBaseOpcode";
+  string TypeOf_NoBias = "MIMGBaseOpcode";
+
+  let PrimaryKey = ["Bias"];
+  let PrimaryKeyName = "getMIMGBiasMappingInfo";
+}
+
 class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> {
   MIMGBaseOpcode G = g;
   MIMGBaseOpcode G16 = g16;
@@ -1140,6 +1156,16 @@
 def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
 def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
 
+// Bias to NoBias Optimization Mapping
+def : MIMGBiasMapping<IMAGE_SAMPLE_B, IMAGE_SAMPLE>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL, IMAGE_SAMPLE_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B, IMAGE_SAMPLE_C>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL, IMAGE_SAMPLE_C_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_CL_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_CL_O>;
+
 // G to G16 Optimization Mapping
 def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6159,6 +6159,8 @@
       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
+  const AMDGPU::MIMGBiasMappingInfo *BiasMappingInfo =
+      AMDGPU::getMIMGBiasMappingInfo(Intr->BaseOpcode);
   unsigned IntrOpcode = Intr->BaseOpcode;
   bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
 
@@ -6268,6 +6270,19 @@
     }
   }
 
+  // Optimize _B away, when the 'bias' is zero
+  bool RemoveBias = false;
+  if (BiasMappingInfo) {
+    if (auto *ConstantBias = dyn_cast<ConstantFPSDNode>(
+            Op.getOperand(ArgOffset + Intr->BiasIndex))) {
+      if (ConstantBias->isZero()) {
+        // Set new opcode to the variant without _b.
+        IntrOpcode = BiasMappingInfo->NoBias;
+        RemoveBias = true;
+      }
+    }
+  }
+
   // Check for 16 bit addresses or derivatives and pack if true.
   MVT VAddrVT =
       Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
@@ -6282,6 +6297,9 @@
 
   // Push back extra arguments.
   for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
+    if (RemoveBias && I == Intr->BiasIndex)
+      continue;
+
     if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
       assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
       // Special handling of bias when A16 is on. Bias is of type half but
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -64,6 +64,7 @@
 #define GET_MIMGEncoding_DECL
 #define GET_MIMGLZMapping_DECL
 #define GET_MIMGMIPMapping_DECL
+#define GET_MIMGBiasMapping_DECL
 #include "AMDGPUGenSearchableTables.inc"
 
 namespace IsaInfo {
@@ -330,6 +331,11 @@
   MIMGBaseOpcode NONMIP;
 };
 
+struct MIMGBiasMappingInfo {
+  MIMGBaseOpcode Bias;
+  MIMGBaseOpcode NoBias;
+};
+
 struct MIMGG16MappingInfo {
   MIMGBaseOpcode G;
   MIMGBaseOpcode G16;
@@ -341,6 +347,9 @@
 LLVM_READONLY
 const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
 
+LLVM_READONLY
+const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias);
+
 LLVM_READONLY
 const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -132,6 +132,7 @@
 #define GET_MIMGInfoTable_IMPL
 #define GET_MIMGLZMappingTable_IMPL
 #define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGBiasMappingTable_IMPL
 #define
GET_MIMGG16MappingTable_IMPL #include "AMDGPUGenSearchableTables.inc" diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.bias_zero.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.bias_zero.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.bias_zero.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.bias_zero.ll @@ -6,10 +6,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -22,11 +20,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -39,11 +34,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v0 -; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_bfrev_b32_e32 v1, 1 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b v[0:3], v[1:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -56,12 +48,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_mov_b32_e32 v4, 
v1 -; GCN-NEXT: v_mov_b32_e32 v5, v2 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b v[0:3], v[2:5], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -74,10 +62,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -90,12 +76,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b_o v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -108,12 +90,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mov_b32_e32 v6, v2 -; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[3:6], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -126,13 +104,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v4, v0 
-; GCN-NEXT: v_mov_b32_e32 v6, v1 -; GCN-NEXT: v_mov_b32_e32 v7, v2 -; GCN-NEXT: v_mov_b32_e32 v8, v3 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[4:8], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -217,14 +190,11 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0xffff +; GCN-NEXT: v_mov_b32_e32 v4, 0xffff ; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_and_or_b32 v3, v4, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_and_or_b32 v2, v2, v4, v3 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 +; GCN-NEXT: image_sample_c_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.bias_zero.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.bias_zero.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.bias_zero.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.bias_zero.ll @@ -6,10 +6,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -22,11 +20,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; 
GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_bfrev_b32_e32 v0, 1 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -39,11 +34,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_bfrev_b32_e32 v0, 1 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -56,12 +48,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -74,10 +62,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -90,11 +76,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: 
v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -107,11 +90,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -124,12 +104,8 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v4, v3 -; GCN-NEXT: v_mov_b32_e32 v3, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf +; GCN-NEXT: image_sample_c_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: @@ -210,13 +186,10 @@ ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: s_mov_b64 s[12:13], exec ; GCN-NEXT: s_wqm_b64 exec, exec -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: v_mov_b32_e32 v2, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; GCN-NEXT: v_lshl_or_b32 v3, v3, 16, v1 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GCN-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] -; GCN-NEXT: image_sample_c_b_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf a16 +; GCN-NEXT: image_sample_c_o v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: