Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -448,6 +448,14 @@
                          TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
 }
 
+/// Matches a G_SELECT whose three source operands match Src0, Src1 and Src2.
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
+inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
+m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
+  return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
+      Src0, Src1, Src2);
+}
+
 /// Matches a register negated by a G_SUB.
 /// G_SUB 0, %negated_reg
 template <typename SrcTy>
@@ -464,7 +472,71 @@
   return m_GXor(Src, m_AllOnesInt());
 }
 
-} // namespace GMIPatternMatch
+/// Matches a signed min/max pair spelled as nested G_SELECT + G_ICMP:
+///   %inner = G_SELECT (G_ICMP P2, %origin, %b2), %origin, %b2
+///   %outer = G_SELECT (G_ICMP P1, %inner, %b1), %inner, %b1
+/// where (P1, P2) is (ICMP_SLT, ICMP_SGT) or (ICMP_SGT, ICMP_SLT). B1 and B2
+/// match the outer and the inner bound, O matches the clamped value. The
+/// predicates are not reported back, so a caller that depends on which bound
+/// is the upper one has to inspect the outer predicate itself.
+template <typename Boundary1, typename Boundary2, typename Origin>
+struct maxmin_match_helper {
+  Boundary1 B1;
+  Boundary2 B2;
+  Origin O;
+
+  maxmin_match_helper(const Boundary1 &FirstBoundary,
+                      const Boundary2 &SecondBoundary, const Origin &Or)
+      : B1(FirstBoundary), B2(SecondBoundary), O(Or) {}
+
+  template <typename OpTy>
+  bool match(const MachineRegisterInfo &MRI, OpTy &&Op) {
+    CmpInst::Predicate OuterPred;
+    Register OuterLHS, OuterRHS, Inner, OuterBound;
+
+    // The compare has to test exactly the two values being selected between,
+    // otherwise the select is not a min/max of them.
+    if (!mi_match(Op, MRI,
+                  m_GISelect(m_GICmp(m_Pred(OuterPred), m_Reg(OuterLHS),
+                                     m_Reg(OuterRHS)),
+                             m_Reg(Inner), m_Reg(OuterBound))) ||
+        OuterLHS != Inner || OuterRHS != OuterBound)
+      return false;
+
+    CmpInst::Predicate InnerPred;
+    Register InnerLHS, InnerRHS, Source, InnerBound;
+
+    if (!mi_match(Inner, MRI,
+                  m_GISelect(m_GICmp(m_Pred(InnerPred), m_Reg(InnerLHS),
+                                     m_Reg(InnerRHS)),
+                             m_Reg(Source), m_Reg(InnerBound))) ||
+        InnerLHS != Source || InnerRHS != InnerBound)
+      return false;
+
+    // One select has to be a signed min and the other one a signed max.
+    const bool IsMinOfMax =
+        OuterPred == CmpInst::ICMP_SLT && InnerPred == CmpInst::ICMP_SGT;
+    const bool IsMaxOfMin =
+        OuterPred == CmpInst::ICMP_SGT && InnerPred == CmpInst::ICMP_SLT;
+    if (!IsMinOfMax && !IsMaxOfMin)
+      return false;
+
+    // Run the caller's sub-patterns only once the structure is known to be a
+    // min/max pair.
+    return mi_match(OuterBound, MRI, B1) && mi_match(InnerBound, MRI, B2) &&
+           mi_match(Source, MRI, O);
+  }
+};
+
+/// Builds a maxmin_match_helper from the outer bound \p B1, the inner bound
+/// \p B2 and the clamped value \p O.
+template <typename Boundary1, typename Boundary2, typename Origin>
+inline maxmin_match_helper<Boundary1, Boundary2, Origin>
+m_MaxMin(const Boundary1 &B1, const Boundary2 &B2, const Origin &O) {
+  return maxmin_match_helper<Boundary1, Boundary2, Origin>(B1, B2, O);
+}
+
+} // namespace MIPatternMatch
 } // namespace llvm
 
 #endif
Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,6 +37,14 @@
          [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
(apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; +def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::ClampI64ToI16MatchInfo">; + +def clamp_i64_to_i16 : GICombineRule< + (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo), + (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16, + [{ return PostLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]), + (apply [{ PostLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -49,7 +57,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPostLegalizerCombinerHelper", [all_combines, gfx6gfx7_combines, - uchar_to_float, cvt_f32_ubyteN]> { + uchar_to_float, cvt_f32_ubyteN, clamp_i64_to_i16]> { let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule"; let StateClass = "AMDGPUPostLegalizerCombinerHelperState"; let AdditionalArguments = []; Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp @@ -1,4 +1,5 @@ -//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===// +//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp +//---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -11,8 +12,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPUTargetMachine.h"
 #include "AMDGPULegalizerInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -22,7 +24,6 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Debug.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 
 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
 
@@ -66,6 +67,22 @@
   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
   void applyCvtF32UByteN(MachineInstr &MI,
                          const CvtF32UByteMatchInfo &MatchInfo);
+
+  /// Result of matchClampI64ToI16: the constant bound of the outer select
+  /// (Cmp1), the constant bound of the inner select (Cmp2) and the s64
+  /// register that is being clamped (Origin).
+  struct ClampI64ToI16MatchInfo {
+    int64_t Cmp1 = 0;
+    int64_t Cmp2 = 0;
+    Register Origin;
+  };
+
+  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineFunction &MF,
+                          ClampI64ToI16MatchInfo &MatchInfo);
+
+  void applyClampI64ToI16(MachineInstr &MI,
+                          const ClampI64ToI16MatchInfo &MatchInfo);
 };
 
 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -188,11 +205,11 @@
     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
 
   if (Ty == S32) {
-    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
-                   {SrcReg}, MI.getFlags());
+    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
+                 MI.getFlags());
   } else {
-    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
-                             {SrcReg}, MI.getFlags());
+    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
+                             MI.getFlags());
     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
   }
 
@@ -245,6 +262,129 @@
   MI.eraseFromParent();
 }
 
+/// Matches a G_TRUNC whose s64 source is a signed clamp, spelled as two nested
+/// G_SELECT + G_ICMP pairs, against constant bounds that both fit into a
+/// signed 16-bit integer. On success \p MatchInfo holds the bound of the outer
+/// select (Cmp1), the bound of the inner select (Cmp2) and the clamped
+/// register (Origin). \p MF is unused.
+bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+    ClampI64ToI16MatchInfo &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+  // We want to check if a 64-bit number gets clamped to 16-bit boundaries (or
+  // below).
+  const Register TruncSrc = MI.getOperand(1).getReg();
+  if (MRI.getType(TruncSrc) != LLT::scalar(64))
+    return false;
+
+  // The rewrite produces a 32-bit value, so it can only feed a result that is
+  // at most 32 bits wide.
+  if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() > 32)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
+
+  if (!mi_match(TruncSrc, MRI,
+                m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
+                         m_Reg(MatchInfo.Origin))))
+    return false;
+
+  // m_MaxMin does not say which of the two selects is the min, so read the
+  // outer predicate again. smin(smax(x, Cmp2), Cmp1) is a clamp only if
+  // Cmp2 <= Cmp1, and smax(smin(x, Cmp2), Cmp1) only if Cmp1 <= Cmp2; with the
+  // bounds the other way round the value is always Cmp1.
+  CmpInst::Predicate OuterPred;
+  if (!mi_match(TruncSrc, MRI,
+                m_GISelect(m_GICmp(m_Pred(OuterPred), m_Reg(), m_Reg()),
+                           m_Reg(), m_Reg())))
+    return false;
+
+  const bool OuterIsMin = OuterPred == CmpInst::ICMP_SLT;
+  const int64_t Lower = OuterIsMin ? MatchInfo.Cmp2 : MatchInfo.Cmp1;
+  const int64_t Upper = OuterIsMin ? MatchInfo.Cmp1 : MatchInfo.Cmp2;
+
+  // Are we really trying to clamp against short boundaries? Doing the range
+  // checks first keeps the subtraction below free of signed overflow.
+  const int64_t Min = std::numeric_limits<int16_t>::min();
+  const int64_t Max = std::numeric_limits<int16_t>::max();
+  if (Lower >= Upper || Lower < Min || Upper > Max)
+    return false;
+
+  // Adjacent bounds are left alone as well: we don't need to clamp here.
+  return Upper - Lower > 1;
+}
+
+/// Rewrites the clamp matched by matchClampI64ToI16.
+///
+/// We want to find a combination of instructions that gets generated when an
+/// i64 gets clamped to i16. The corresponding pattern is:
+///   G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
+/// This can be efficiently written as following:
+///   v_cvt_pk_i16_i32 v0, v0, v1
+///   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
+void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
+    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+  LLVM_DEBUG(dbgs() << "Combining MI\n");
+
+  MachineIRBuilder B(MI);
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  Register Src = MatchInfo.Origin;
+  assert(MRI.getType(Src) == LLT::scalar(64));
+  const LLT S32 = LLT::scalar(32);
+  const TargetRegisterClass *VGPR32 = &AMDGPU::VGPR_32RegClass;
+
+  // G_UNMERGE_VALUES defines its pieces starting with the least significant
+  // one: def 0 is the low half of Src, def 1 the high half.
+  auto Unmerge = B.buildUnmerge(S32, Src);
+  Register Lo32 = Unmerge.getReg(0);
+  Register Hi32 = Unmerge.getReg(1);
+  MRI.setRegClass(Lo32, VGPR32);
+  MRI.setRegClass(Hi32, VGPR32);
+
+  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
+  assert(MI.getOpcode() != CvtOpcode);
+
+  Register CvtDst = MRI.createVirtualRegister(VGPR32);
+  MRI.setType(CvtDst, S32);
+  B.buildInstr(CvtOpcode)
+      .addDef(CvtDst)
+      .addReg(Lo32)
+      .addReg(Hi32)
+      .setMIFlags(MI.getFlags());
+
+  const int64_t MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+  const int64_t MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+
+  Register MinBoundaryDst = MRI.createVirtualRegister(VGPR32);
+  MRI.setType(MinBoundaryDst, S32);
+  B.buildConstant(MinBoundaryDst, MinBoundary);
+
+  Register MaxBoundaryDst = MRI.createVirtualRegister(VGPR32);
+  MRI.setType(MaxBoundaryDst, S32);
+  B.buildConstant(MaxBoundaryDst, MaxBoundary);
+
+  Register MedDst = MRI.createVirtualRegister(VGPR32);
+  MRI.setType(MedDst, S32);
+  B.buildInstr(AMDGPU::V_MED3_I32)
+      .addDef(MedDst)
+      .addReg(MinBoundaryDst)
+      .addReg(CvtDst)
+      .addReg(MaxBoundaryDst)
+      .setMIFlags(MI.getFlags());
+
+  // MedDst is s32. A COPY is only well-typed if the G_TRUNC result is s32 as
+  // well; a narrower result needs a real truncate.
+  const Register Dst = MI.getOperand(0).getReg();
+  if (MRI.getType(Dst) == S32)
+    B.buildCopy(Dst, MedDst);
+  else
+    B.buildTrunc(Dst, MedDst);
+
+  MI.eraseFromParent();
+}
+
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
   CombinerHelper &Helper;
@@ -331,6 +471,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
+
 private:
   bool IsOptNone;
 };
@@ -350,7 +491,7 @@
 }
 
 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
-  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
 }
 
@@ -364,8 +505,8 @@
       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
 
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  const AMDGPULegalizerInfo *LI
-    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+  const AMDGPULegalizerInfo *LI =
+      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
 
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
@@ -378,8 +519,8 @@
 
 char AMDGPUPostLegalizerCombiner::ID = 0;
 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
-                      "Combine AMDGPU machine instrs after legalization",
-                      false, false)
+                      "Combine AMDGPU machine instrs after legalization", false,
+                      false)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -0,0 +1,131 @@
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX678,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16:
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
+define i16 @v_clamp_i64_i16(i64 %in) nounwind {
+entry:
+  %0 = icmp sgt i64 %in, -32768
+  %1 = select i1 %0, i64 %in, i64 -32768
+  %2 = icmp slt i64 %1, 32767
+  %3 = select i1 %2, i64 %1, i64 32767
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_reverse:
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
+; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[C]]
+define i16 @v_clamp_i64_i16_reverse(i64 %in) nounwind {
+entry:
+  %0 = icmp slt i64 %in, 32767
+  %1 = select i1 %0, i64 %in, i64 32767
+  %2 = icmp sgt i64 %1, -32768
+  %3 = select i1 %2, i64 %1, i64 -32768
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower:
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
+
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
+; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
+define i16 @v_clamp_i64_i16_wrong_lower(i64 %in) nounwind {
+entry:
+  %0 = icmp slt i64 %in, 32769
+  %1 = select i1 %0, i64 %in, i64 32769
+  %2 = icmp sgt i64 %1, -32768
+  %3 = select i1 %2, i64 %1, i64 -32768
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_wrong_lower_and_higher:
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
+define i16 @v_clamp_i64_i16_wrong_lower_and_higher(i64 %in) nounwind {
+entry:
+  %0 = icmp sgt i64 %in, -32769
+  %1 = select i1 %0, i64 %in, i64 -32769
+  %2 = icmp slt i64 %1, 32768
+  %3 = select i1 %2, i64 %1, i64 32768
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short:
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
+define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) nounwind {
+entry:
+  %0 = icmp slt i64 %in, 256
+  %1 = select i1 %0, i64 %in, i64 256
+  %2 = icmp sgt i64 %1, -255
+  %3 = select i1 %2, i64 %1, i64 -255
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse:
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
+; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[C]]
+define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) nounwind {
+entry:
+  %0 = icmp sgt i64 %in, -255
+  %1 = select i1 %0, i64 %in, i64 -255
+  %2 = icmp slt i64 %1, 256
+  %3 = select i1 %2, i64 %1, i64 256
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}
+
+; GCN-LABEL: {{^}}v_clamp_i64_i16_zero:
+; GFX678: v_mov_b32_e32 [[A:v[0-9]+]], 0
+; GFX10: v_mov_b32_e32 [[A:v[0-9]+]], 0
+define i16 @v_clamp_i64_i16_zero(i64 %in) nounwind {
+entry:
+  %0 = icmp sgt i64 %in, 0
+  %1 = select i1 %0, i64 %in, i64 0
+  %2 = icmp slt i64 %1, 0
+  %3 = select i1 %2, i64 %1, i64 0
+  %4 = trunc i64 %3 to i16
+
+  ret i16 %4
+}