Index: llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
+++ llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h
@@ -286,6 +286,18 @@
   return BinaryOp_match(L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, true>
+m_GSMax(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMAX, true>(L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, true>
+m_GSMin(const LHS &L, const RHS &R) {
+  return BinaryOp_match<LHS, RHS, TargetOpcode::G_SMIN, true>(L, R);
+}
+
 // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc
 template <typename SrcTy, unsigned Opcode> struct UnaryOp_match {
   SrcTy L;
@@ -448,6 +460,13 @@
                          TargetOpcode::G_INSERT_VECTOR_ELT>(Src0, Src1, Src2);
 }
 
+template <typename Src0Ty, typename Src1Ty, typename Src2Ty>
+inline TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>
+m_GISelect(const Src0Ty &Src0, const Src1Ty &Src1, const Src2Ty &Src2) {
+  return TernaryOp_match<Src0Ty, Src1Ty, Src2Ty, TargetOpcode::G_SELECT>(
+      Src0, Src1, Src2);
+}
+
 /// Matches a register negated by a G_SUB.
 /// G_SUB 0, %negated_reg
 template <typename SrcTy>
@@ -464,7 +483,8 @@
   return m_GXor(Src, m_AllOnesInt());
 }
 
-} // namespace GMIPatternMatch
+
+} // namespace MIPatternMatch
 } // namespace llvm
 
 #endif
Index: llvm/lib/Target/AMDGPU/AMDGPUCombine.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -37,13 +37,21 @@
   [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+
+def clamp_i64_to_i16 : GICombineRule<
+  (defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
+  (match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
+    [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
+  (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
-
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines, clamp_i64_to_i16]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+  let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
 }
 
 def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -166,6 +166,9 @@
 def : GINodeEquiv;
 def : GINodeEquiv;
 
+def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUcvt_pk_i16_i32>;
+def : GINodeEquiv<G_AMDGPU_MED3_S32, AMDGPUsmed3>;
+
 def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -213,6 +213,8 @@
 def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
   SDTIntToFPOp, []>;
 
+def AMDGPUcvt_pk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32",
+  AMDGPUIntPackOp, []>;
 
 // urecip - This operation is a helper for integer division, it returns the
 // result of 1 / a as a fractional unsigned integer.
Index: llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -13,6 +13,8 @@
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPULegalizerInfo.h"
+
+
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
@@ -331,6 +333,7 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
+
 private:
   bool IsOptNone;
 };
Index: llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -28,6 +28,153 @@
 using namespace llvm;
 using namespace MIPatternMatch;
 
+class AMDGPUPreLegalizerCombinerHelper {
+protected:
+  MachineIRBuilder &B;
+  MachineFunction &MF;
+  MachineRegisterInfo &MRI;
+  CombinerHelper &Helper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+
+  struct ClampI64ToI16MatchInfo {
+    int64_t Cmp1;
+    int64_t Cmp2;
+    Register Origin;
+  };
+
+  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
+                          MachineFunction &MF,
+                          ClampI64ToI16MatchInfo &MatchInfo);
+
+  void applyClampI64ToI16(MachineInstr &MI,
+                          const ClampI64ToI16MatchInfo &MatchInfo);
+};
+
+bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
+    ClampI64ToI16MatchInfo &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
+
+  // Try to find a pattern where an i64 value should get clamped to short.
+  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
+  if (SrcType != LLT::scalar(64))
+    return false;
+
+  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+  if (DstType != LLT::scalar(16))
+    return false;
+
+  Register Base;
+
+  // Try to match a combination of min / max MIR opcodes.
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+    if (!mi_match(Base, MRI,
+                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+      return false;
+    }
+  }
+
+  if (mi_match(MI.getOperand(1).getReg(), MRI,
+               m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
+    if (!mi_match(Base, MRI,
+                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
+      return false;
+    }
+  }
+
+  const auto Cmp1 = MatchInfo.Cmp1;
+  const auto Cmp2 = MatchInfo.Cmp2;
+  const auto Diff = std::abs(Cmp2 - Cmp1);
+
+  // If the difference between both comparison values is 0 or 1, there is no
+  // need to clamp.
+  if (Diff == 0 || Diff == 1)
+    return false;
+
+  const int64_t Min = std::numeric_limits<int16_t>::min();
+  const int64_t Max = std::numeric_limits<int16_t>::max();
+
+  // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
+  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
+          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
+}
+
+// We want to find a combination of instructions that
+// gets generated when an i64 gets clamped to i16.
+// The corresponding pattern is:
+// G_SMIN / G_SMAX for i16 <= G_TRUNC i64.
+// This can be efficiently written as the following:
+// v_cvt_pk_i16_i32 v0, v0, v1
+// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
+void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
+    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+  Register Src = MatchInfo.Origin;
+  assert(MRI.getType(Src) == LLT::scalar(64));
+  const LLT S32 = LLT::scalar(32);
+
+  B.setMBB(*MI.getParent());
+  B.setInstrAndDebugLoc(MI);
+
+  auto Unmerge = B.buildUnmerge(S32, Src);
+  Register Hi32 = Unmerge.getReg(0);
+  Register Lo32 = Unmerge.getReg(1);
+  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
+  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
+
+  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
+
+  Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  const LLT V2S16 = LLT::vector(2, 16);
+  MRI.setType(CvtDst, V2S16);
+
+  B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
+               {CvtDst},
+               {Hi32, Lo32},
+               MI.getFlags());
+
+  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
+  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
+
+  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
+  MRI.setRegClass(MinBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);
+
+  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
+  MRI.setRegClass(MaxBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);
+
+  Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MRI.setType(MedDst, S32);
+
+  Register CvtDst32 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MRI.setType(CvtDst32, S32);
+
+  B.buildBitcast(CvtDst32, CvtDst);
+
+  B.buildInstr(AMDGPU::G_AMDGPU_MED3_S32,
+               {MedDst},
+               {MinBoundaryDst.getReg(0), CvtDst32, MaxBoundaryDst.getReg(0)},
+               MI.getFlags());
+
+  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
+  B.buildTrunc(TruncDst, MedDst);
+  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);
+
+  MI.eraseFromParent();
+}
+
+class AMDGPUPreLegalizerCombinerHelperState {
+protected:
+  CombinerHelper &Helper;
+  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
+
+public:
+  AMDGPUPreLegalizerCombinerHelperState(
+      CombinerHelper &Helper,
+      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
+      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
+};
+
 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
 #include "AMDGPUGenPreLegalizeGICombiner.inc"
 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
@@ -61,7 +208,9 @@
     MachineInstr &MI, MachineIRBuilder &B) const {
   CombinerHelper Helper(Observer, B, KB, MDT);
-  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
+  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
+                                                PreLegalizerHelper);
 
   if (Generated.tryCombineAll(Observer, MI, B, Helper))
     return true;
@@ -127,6 +276,7 @@
   const Function &F = MF.getFunction();
   bool EnableOpt =
       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3637,6 +3637,8 @@
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
+  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
+  case AMDGPU::G_AMDGPU_MED3_S32:
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2587,6 +2587,18 @@
   }
 }
 
+def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_AMDGPU_MED3_S32 : AMDGPUGenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
 // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector
 // operand Expects a MachineMemOperand in addition to explicit
 // operands.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-short-clamp.ll
@@ -0,0 +1,112 @@
+; RUN: llc -global-isel -mcpu=tahiti -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX678,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx900 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9,GFX6789 %s
+; RUN: llc -global-isel -mcpu=gfx1010 -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
+
+declare i64 @llvm.smax.i64(i64, i64)
+declare i64 @llvm.smin.i64(i64, i64)
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
+define i16 @v_clamp_i64_i16(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32768)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32767)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_reverse
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0x7fff
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffff8000
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x7fff
+define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32767)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8001
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+; GFX6789: v_cndmask_b32_e32 [[C:v[0-9]+]], 0, [[C]], vcc
+
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8001, [[A]], vcc_lo
+; GFX10: v_cndmask_b32_e32 [[B:v[0-9]+]], 0, [[B]], vcc_lo
+define i16 @v_clamp_i64_i16_invalid_lower(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 32769)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -32768)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_invalid_lower_and_higher
+; GFX6789: v_mov_b32_e32 [[B:v[0-9]+]], 0x8000
+; GFX6789: v_cndmask_b32_e32 [[A:v[0-9]+]], [[B]], [[A]], vcc
+; GFX10: v_cndmask_b32_e32 [[A:v[0-9]+]], 0x8000, [[A]], vcc_lo
+define i16 @v_clamp_i64_i16_invalid_lower_and_higher(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -32769)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 32768)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0x100
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
+define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
+entry:
+  %min = call i64 @llvm.smin.i64(i64 %in, i64 256)
+  %max = call i64 @llvm.smax.i64(i64 %min, i64 -255)
+  %result = trunc i64 %max to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_lower_than_short_reverse
+; GFX678: v_cvt_pk_i16_i32_e32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX9: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX6789: v_mov_b32_e32 [[B]], 0x100
+; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
+; GFX6789: v_med3_i32 [[A]], [[C]], [[A]], [[B]]
+; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
+; GFX10: v_mov_b32_e32 [[C:v[0-9]+]], 0xffffff01
+; GFX10: v_med3_i32 [[A]], [[C]], [[A]], 0x100
+define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 -255)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 256)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}
+
+; GFX10-LABEL: {{^}}v_clamp_i64_i16_zero
+; GFX6789: v_mov_b32_e32 v0, 0
+; GFX10: v_mov_b32_e32 v0, 0
+define i16 @v_clamp_i64_i16_zero(i64 %in) #0 {
+entry:
+  %max = call i64 @llvm.smax.i64(i64 %in, i64 0)
+  %min = call i64 @llvm.smin.i64(i64 %max, i64 0)
+  %result = trunc i64 %min to i16
+  ret i16 %result
+}