diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -45,6 +45,17 @@ [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]), (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>; +def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">; + +def int_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_SMAX, + G_SMIN, + G_UMAX, + G_UMIN):$min_or_max, + [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + // Combines which should only apply on SI/VI def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>; @@ -64,6 +75,8 @@ } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold]> { + "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; + let StateClass = "AMDGPURegBankCombinerHelperState"; + let AdditionalArguments = []; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -167,7 +167,8 @@ def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; +def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -143,7 +143,7 @@ auto Bitcast = B.buildBitcast({S32}, CvtPk); auto Med3 = B.buildInstr( - AMDGPU::G_AMDGPU_MED3, {S32}, + 
AMDGPU::G_AMDGPU_SMED3, {S32}, {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, MI.getFlags()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -13,7 +13,9 @@ #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -27,6 +29,126 @@ using namespace llvm; using namespace MIPatternMatch; +class AMDGPURegBankCombinerHelper { +protected: + MachineIRBuilder &B; + MachineFunction &MF; + MachineRegisterInfo &MRI; + const RegisterBankInfo &RBI; + const TargetRegisterInfo &TRI; + CombinerHelper &Helper; + +public: + AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) + : B(B), MF(B.getMF()), MRI(*B.getMRI()), + RBI(*MF.getSubtarget().getRegBankInfo()), + TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + + bool isVgprRegBank(Register Reg); + + struct MinMaxMedOpc { + unsigned Min, Max, Med; + }; + + struct Med3MatchInfo { + unsigned Opc; + Register Val0, Val1, Val2; + }; + + MinMaxMedOpc getMinMaxPair(unsigned Opc); + + template + bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, + Register &Val, Register &K0, Register &K1); + + bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); +}; + +bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { + return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; +} + +AMDGPURegBankCombinerHelper::MinMaxMedOpc +AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { + switch (Opc) { + default: + 
llvm_unreachable("Unsupported opcode"); + case AMDGPU::G_SMAX: + case AMDGPU::G_SMIN: + return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3}; + case AMDGPU::G_UMAX: + case AMDGPU::G_UMIN: + return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + } +} + +template +bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, + MachineRegisterInfo &MRI, + MinMaxMedOpc MMMOpc, Register &Val, + Register &K0, Register &K1) { + // 4 operand commutes of: min(max(Val, K0), K1). + // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). + // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). + // 4 operand commutes of: max(min(Val, K1), K0). + // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)). + // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1). + return mi_match( + MI, MRI, + m_any_of( + m_CommutativeBinOp( + MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)), + m_Cst(K1)), + m_CommutativeBinOp( + MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)), + m_Cst(K0)))); +} + +bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + if (!isVgprRegBank(Dst)) + return false; + + if (MRI.getType(Dst).isVector()) + return false; + + MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val, K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. 
+ if (!matchMed(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + const APInt &K0_Imm = getConstantIntVRegVal(K0, MRI)->getValue(); + const APInt &K1_Imm = getConstantIntVRegVal(K1, MRI)->getValue(); + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0_Imm.sgt(K1_Imm)) + return false; + if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0_Imm.ugt(K1_Imm)) + return false; + + MatchInfo = {OpcodeTriple.Med, Val, K0, K1}; + return true; +} + +void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, + Med3MatchInfo &MatchInfo) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, + {MatchInfo.Val0, MatchInfo.Val1, MatchInfo.Val2}, MI.getFlags()); + MI.eraseFromParent(); +} + +class AMDGPURegBankCombinerHelperState { +protected: + CombinerHelper &Helper; + AMDGPURegBankCombinerHelper &RegBankHelper; + +public: + AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, + AMDGPURegBankCombinerHelper &RegBankHelper) + : Helper(Helper), RegBankHelper(RegBankHelper) {} +}; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" @@ -62,9 +184,11 @@ MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, KB, MDT); - AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg); + AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); + AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, + RegBankHelper); - if (Generated.tryCombineAll(Observer, MI, B, Helper)) + if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3507,7 +3507,7 @@ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2: case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3: case AMDGPU::G_AMDGPU_CVT_PK_I16_I32: - case AMDGPU::G_AMDGPU_MED3: + case AMDGPU::G_AMDGPU_SMED3: return 
getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2652,7 +2652,13 @@ let hasSideEffects = 0; } -def G_AMDGPU_MED3 : AMDGPUGenericInstruction { +def G_AMDGPU_SMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); let hasSideEffects = 0; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-smed3.mir @@ -0,0 +1,328 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + 
%2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... + +--- +name: min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_ValK0__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_ValK0__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_K0Val__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_K0Val__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 -12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_ValK1_K0_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_ValK1_K0_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_K1Val_K0_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_K1Val_K0_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_ValK1__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_ValK1__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[AMDGPU_SMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_SMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_SMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_SMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 -12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_SMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__v2i16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__v2i16 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; CHECK: [[SMIN:%[0-9]+]]:vgpr(<2 x s16>) = G_SMIN [[COPY2]], [[COPY]] + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; CHECK: [[SMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_SMAX [[COPY3]], [[SMIN]] + ; CHECK: $vgpr0 = COPY [[SMAX]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %9:sgpr(s32) = G_CONSTANT i32 17 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32) + %10:sgpr(s32) = G_CONSTANT i32 -12 + %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %10(s32), %10(s32) + %11:vgpr(<2 x s16>) = COPY %2(<2 x s16>) + %4:vgpr(<2 x s16>) = G_SMIN %11, %0 + %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>) + %7:vgpr(<2 x s16>) = G_SMAX %12, %4 + $vgpr0 = COPY %7(<2 x s16>) + %8:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %8, implicit $vgpr0 +... 
+ +--- +name: test_uniform_min_max +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $sgpr2 + + ; CHECK-LABEL: name: test_uniform_min_max + ; CHECK: liveins: $sgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 -12 + ; CHECK: [[SMAX:%[0-9]+]]:sgpr(s32) = G_SMAX [[COPY]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[SMIN:%[0-9]+]]:sgpr(s32) = G_SMIN [[SMAX]], [[C1]] + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[SMIN]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + %0:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_CONSTANT i32 -12 + %4:sgpr(s32) = G_SMAX %0, %3 + %5:sgpr(s32) = G_CONSTANT i32 17 + %6:sgpr(s32) = G_SMIN %4, %5 + %8:vgpr(s32) = COPY %6(s32) + %7:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %8(s32) + $sgpr0 = COPY %7(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-umed3.mir @@ -0,0 +1,329 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: test_min_max_ValK0_K1_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_max_ValK0_K1_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: min_max_ValK0_K1_i32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: min_max_ValK0_K1_i32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_ValK0__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_ValK0__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_min_K1max_K0Val__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_min_K1max_K0Val__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C]], [[C1]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 12 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMAX %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 17 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMIN %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_ValK1_K0_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_ValK1_K0_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_min_K1Val_K0_u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_min_K1Val_K0_u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %3, %8 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_ValK1__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_ValK1__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %0, %7 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__u32 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__u32 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[AMDGPU_UMED3_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_UMED3 [[COPY]], [[C1]], [[C]] + ; CHECK: $vgpr0 = COPY [[AMDGPU_UMED3_]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0 + %0:vgpr(s32) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %2:sgpr(s32) = G_CONSTANT i32 17 + %7:vgpr(s32) = COPY %2(s32) + %3:vgpr(s32) = G_UMIN %7, %0 + %4:sgpr(s32) = G_CONSTANT i32 12 + %8:vgpr(s32) = COPY %4(s32) + %5:vgpr(s32) = G_UMAX %8, %3 + $vgpr0 = COPY %5(s32) + %6:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %6, implicit $vgpr0 +... 
+ +--- +name: test_max_K0min_K1Val__v2u16 +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $vgpr0, $sgpr30_sgpr31 + + ; CHECK-LABEL: name: test_max_K0min_K1Val__v2u16 + ; CHECK: liveins: $vgpr0, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C]](s32), [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[C1]](s32), [[C1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC]](<2 x s16>) + ; CHECK: [[UMIN:%[0-9]+]]:vgpr(<2 x s16>) = G_UMIN [[COPY2]], [[COPY]] + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[BUILD_VECTOR_TRUNC1]](<2 x s16>) + ; CHECK: [[UMAX:%[0-9]+]]:vgpr(<2 x s16>) = G_UMAX [[COPY3]], [[UMIN]] + ; CHECK: $vgpr0 = COPY [[UMAX]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY1]] + ; CHECK: S_SETPC_B64_return [[COPY4]], implicit $vgpr0 + %0:vgpr(<2 x s16>) = COPY $vgpr0 + %1:sgpr_64 = COPY $sgpr30_sgpr31 + %9:sgpr(s32) = G_CONSTANT i32 17 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %9(s32), %9(s32) + %10:sgpr(s32) = G_CONSTANT i32 12 + %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %10(s32), %10(s32) + %11:vgpr(<2 x s16>) = COPY %2(<2 x s16>) + %4:vgpr(<2 x s16>) = G_UMIN %11, %0 + %12:vgpr(<2 x s16>) = COPY %5(<2 x s16>) + %7:vgpr(<2 x s16>) = G_UMAX %12, %4 + $vgpr0 = COPY %7(<2 x s16>) + %8:ccr_sgpr_64 = COPY %1 + S_SETPC_B64_return %8, implicit $vgpr0 +... 
+ +--- +name: test_uniform_min_max +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.1: + liveins: $sgpr2 + + ; CHECK-LABEL: name: test_uniform_min_max + ; CHECK: liveins: $sgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr2 + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 12 + ; CHECK: [[UMAX:%[0-9]+]]:sgpr(s32) = G_UMAX [[COPY]], [[C]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 17 + ; CHECK: [[UMIN:%[0-9]+]]:sgpr(s32) = G_UMIN [[UMAX]], [[C1]] + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[UMIN]](s32) + ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + %0:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_CONSTANT i32 12 + %4:sgpr(s32) = G_UMAX %0, %3 + %5:sgpr(s32) = G_CONSTANT i32 17 + %6:sgpr(s32) = G_UMIN %4, %5 + %8:vgpr(s32) = COPY %6(s32) + %7:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %8(s32) + $sgpr0 = COPY %7(s32) + SI_RETURN_TO_EPILOG implicit $sgpr0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +define i32 @test_min_max_ValK0_K1_i32(i32 %a) { +; GFX10-LABEL: test_min_max_ValK0_K1_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12) + %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17) + ret i32 %smed +} + +define i32 @min_max_ValK0_K1_i32(i32 %a) { +; GFX10-LABEL: min_max_ValK0_K1_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a) + %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17) + ret i32 %smed +} + +define i32 @test_min_K1max_ValK0__i32(i32 %a) { +; GFX10-LABEL: test_min_K1max_ValK0__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12) + %smed = call i32 @llvm.smin.i32(i32 17, i32 %smax) + ret i32 %smed +} + +define i32 @test_min_K1max_K0Val__i32(i32 %a) { +; GFX10-LABEL: test_min_K1max_K0Val__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 -12, i32 %a) + %smed = call i32 
@llvm.smin.i32(i32 17, i32 %smax) + ret i32 %smed +} + +define i32 @test_max_min_ValK1_K0_i32(i32 %a) { +; GFX10-LABEL: test_max_min_ValK1_K0_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 17) + %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12) + ret i32 %smed +} + +define i32 @test_max_min_K1Val_K0_i32(i32 %a) { +; GFX10-LABEL: test_max_min_K1Val_K0_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 17, i32 %a) + %smed = call i32 @llvm.smax.i32(i32 %smin, i32 -12) + ret i32 %smed +} + +define i32 @test_max_K0min_ValK1__i32(i32 %a) { +; GFX10-LABEL: test_max_K0min_ValK1__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 17) + %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin) + ret i32 %smed +} + +define i32 @test_max_K0min_K1Val__i32(i32 %a) { +; GFX10-LABEL: test_max_K0min_K1Val__i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_i32 v0, v0, -12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 17, i32 %a) + %smed = call i32 @llvm.smax.i32(i32 -12, i32 %smin) + ret i32 %smed +} + +define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) { +; GFX10-LABEL: test_max_K0min_K1Val__v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_min_i16 v0, 17, v0 op_sel_hi:[0,1] +; GFX10-NEXT: 
v_pk_max_i16 v0, -12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
+  %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> <i16 -12, i16 -12>, <2 x i16> %smin)
+  ret <2 x i16> %smed
+}
+
+define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
+; GFX10-LABEL: test_uniform_min_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_max_i32 s0, s2, -12
+; GFX10-NEXT:    s_min_i32 s0, s0, 17
+; GFX10-NEXT:    ; return to shader part epilog
+  %smax = call i32 @llvm.smax.i32(i32 %a, i32 -12)
+  %smed = call i32 @llvm.smin.i32(i32 %smax, i32 17)
+  ret i32 %smed
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare <2 x i16> @llvm.smin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v2i16(<2 x i16>, <2 x i16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+
+define i32 @test_min_max_ValK0_K1_u32(i32 %a) {
+; GFX10-LABEL: test_min_max_ValK0_K1_u32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
+  %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
+  ret i32 %umed
+}
+
+define i32 @min_max_ValK0_K1_i32(i32 %a) {
+; GFX10-LABEL: min_max_ValK0_K1_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umax = call i32 @llvm.umax.i32(i32 12, i32 %a)
+  %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
+ ret i32 %umed +} + +define i32 @test_min_K1max_ValK0__u32(i32 %a) { +; GFX10-LABEL: test_min_K1max_ValK0__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 %a, i32 12) + %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax) + ret i32 %umed +} + +define i32 @test_min_K1max_K0Val__u32(i32 %a) { +; GFX10-LABEL: test_min_K1max_K0Val__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 12, i32 %a) + %umed = call i32 @llvm.umin.i32(i32 17, i32 %umax) + ret i32 %umed +} + +define i32 @test_max_min_ValK1_K0_u32(i32 %a) { +; GFX10-LABEL: test_max_min_ValK1_K0_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 %a, i32 17) + %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12) + ret i32 %umed +} + +define i32 @test_max_min_K1Val_K0_u32(i32 %a) { +; GFX10-LABEL: test_max_min_K1Val_K0_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 17, i32 %a) + %umed = call i32 @llvm.umax.i32(i32 %umin, i32 12) + ret i32 %umed +} + +define i32 @test_max_K0min_ValK1__u32(i32 %a) { +; GFX10-LABEL: test_max_K0min_ValK1__u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_med3_u32 v0, v0, 12, 17 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 %a, i32 17) + %umed = 
call i32 @llvm.umax.i32(i32 12, i32 %umin)
+  ret i32 %umed
+}
+
+define i32 @test_max_K0min_K1Val__u32(i32 %a) {
+; GFX10-LABEL: test_max_K0min_K1Val__u32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_med3_u32 v0, v0, 12, 17
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call i32 @llvm.umin.i32(i32 17, i32 %a)
+  %umed = call i32 @llvm.umax.i32(i32 12, i32 %umin)
+  ret i32 %umed
+}
+
+define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) {
+; GFX10-LABEL: test_max_K0min_K1Val__v2u16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_pk_min_u16 v0, 17, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_max_u16 v0, 12, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+  %umin = call <2 x i16> @llvm.umin.v2i16(<2 x i16> <i16 17, i16 17>, <2 x i16> %a)
+  %umed = call <2 x i16> @llvm.umax.v2i16(<2 x i16> <i16 12, i16 12>, <2 x i16> %umin)
+  ret <2 x i16> %umed
+}
+
+define amdgpu_ps i32 @test_uniform_min_max(i32 inreg %a) {
+; GFX10-LABEL: test_uniform_min_max:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_max_u32 s0, s2, 12
+; GFX10-NEXT:    s_min_u32 s0, s0, 17
+; GFX10-NEXT:    ; return to shader part epilog
+  %umax = call i32 @llvm.umax.i32(i32 %a, i32 12)
+  %umed = call i32 @llvm.umin.i32(i32 %umax, i32 17)
+  ret i32 %umed
+}
+
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
+declare <2 x i16> @llvm.umin.v2i16(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v2i16(<2 x i16>, <2 x i16>)