Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -170,8 +170,6 @@
   case AMDGPUISD::CLAMP:
   case AMDGPUISD::COS_HW:
   case AMDGPUISD::SIN_HW:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
   case AMDGPUISD::FMED3:
   case AMDGPUISD::FMAD_FTZ:
   case AMDGPUISD::RCP:
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -395,12 +395,6 @@
   FMAX_LEGACY,
   FMIN_LEGACY,
-  FMAX3,
-  SMAX3,
-  UMAX3,
-  FMIN3,
-  SMIN3,
-  UMIN3,
   FMED3,
   SMED3,
   UMED3,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4413,12 +4413,6 @@
   NODE_NAME_CASE(SIN_HW)
   NODE_NAME_CASE(FMAX_LEGACY)
   NODE_NAME_CASE(FMIN_LEGACY)
-  NODE_NAME_CASE(FMAX3)
-  NODE_NAME_CASE(SMAX3)
-  NODE_NAME_CASE(UMAX3)
-  NODE_NAME_CASE(FMIN3)
-  NODE_NAME_CASE(SMIN3)
-  NODE_NAME_CASE(UMIN3)
   NODE_NAME_CASE(FMED3)
   NODE_NAME_CASE(SMED3)
   NODE_NAME_CASE(UMED3)
@@ -4802,8 +4796,6 @@
            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
   }
   case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMIN3:
-  case AMDGPUISD::FMAX3:
   case AMDGPUISD::FMAD_FTZ: {
     if (SNaN)
       return true;
Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -150,38 +150,6 @@
   []
 >;
 
-// FIXME: TableGen doesn't like commutative instructions with more
-// than 2 operands.
-// out = max(a, b, c) a, b and c are floats
-def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
-// out = max(a, b, c) a, b, and c are signed ints
-def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
-// out = max(a, b, c) a, b and c are unsigned ints
-def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
-// out = min(a, b, c) a, b and c are floats
-def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
-// out = min(a, b, c) a, b and c are signed ints
-def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
-// out = min(a, b) a and b are unsigned ints
-def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
-  [/*SDNPCommutative, SDNPAssociative*/]
->;
-
 // out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
 def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;
Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -353,10 +353,6 @@
     return getGeneration() >= AMDGPUSubtarget::GFX9;
   }
 
-  bool hasMin3Max3_16() const {
-    return getGeneration() >= AMDGPUSubtarget::GFX9;
-  }
-
   bool hasFmaMixInsts() const {
     return HasFmaMixInsts;
   }
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9728,9 +9728,7 @@
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
   case AMDGPUISD::CLAMP:
-  case AMDGPUISD::FMED3:
-  case AMDGPUISD::FMAX3:
-  case AMDGPUISD::FMIN3: {
+  case AMDGPUISD::FMED3: {
     // FIXME: Shouldn't treat the generic operations different based these.
     // However, we aren't really required to flush the result from
     // minnum/maxnum..
@@ -9980,27 +9978,6 @@
   return isCanonicalized(DAG, N0) ? N0 : SDValue();
 }
 
-static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
-  switch (Opc) {
-  case ISD::FMAXNUM:
-  case ISD::FMAXNUM_IEEE:
-    return AMDGPUISD::FMAX3;
-  case ISD::SMAX:
-    return AMDGPUISD::SMAX3;
-  case ISD::UMAX:
-    return AMDGPUISD::UMAX3;
-  case ISD::FMINNUM:
-  case ISD::FMINNUM_IEEE:
-    return AMDGPUISD::FMIN3;
-  case ISD::SMIN:
-    return AMDGPUISD::SMIN3;
-  case ISD::UMIN:
-    return AMDGPUISD::UMIN3;
-  default:
-    llvm_unreachable("Not a min/max opcode");
-  }
-}
-
 SDValue SITargetLowering::performIntMed3ImmCombine(
     SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1,
     bool Signed) const {
@@ -10120,36 +10097,6 @@
 
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
-      !VT.isVector() &&
-      (VT == MVT::i32 || VT == MVT::f32 ||
-       ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
-    // max(max(a, b), c) -> max3(a, b, c)
-    // min(min(a, b), c) -> min3(a, b, c)
-    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
-      SDLoc DL(N);
-      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
-                         DL,
-                         N->getValueType(0),
-                         Op0.getOperand(0),
-                         Op0.getOperand(1),
-                         Op1);
-    }
-
-    // Try commuted.
-    // max(a, max(b, c)) -> max3(a, b, c)
-    // min(a, min(b, c)) -> min3(a, b, c)
-    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
-      SDLoc DL(N);
-      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
-                         DL,
-                         N->getValueType(0),
-                         Op0,
-                         Op1.getOperand(0),
-                         Op1.getOperand(1));
-    }
-  }
-
   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -347,15 +347,15 @@
 // XXX - No FPException seems suspect but manual doesn't say it does
 let mayRaiseFPException = 0 in {
 let isCommutable = 1 in {
-  defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile, AMDGPUsmin3>;
-  defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile, AMDGPUumin3>;
-  defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile, AMDGPUsmax3>;
-  defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile, AMDGPUumax3>;
+  defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile>;
+  defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile>;
+  defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile>;
+  defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile>;
   defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile, AMDGPUsmed3>;
   defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile, AMDGPUumed3>;
 } // End isCommutable = 1
-  defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile, AMDGPUfmin3>;
-  defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile, AMDGPUfmax3>;
+  defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile>;
+  defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile>;
   defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile, AMDGPUfmed3>;
 } // End mayRaiseFPException = 0
@@ -609,6 +612,12 @@
   }];
 }
 
+class ThreeOp_i32_Pats : GCNPat <
+  // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
+  (ThreeOpFrag i32:$src0, i32:$src1, i32:$src2),
+  (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
+>;
+
 let SubtargetPredicate = isGFX9Plus in {
 let isCommutable = 1, isReMaterializable = 1 in {
   defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile>;
@@ -624,13 +630,13 @@
 defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile, AMDGPUsmed3>;
 defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile, AMDGPUumed3>;
 
-defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile, AMDGPUfmin3>;
-defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile, AMDGPUsmin3>;
-defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile, AMDGPUumin3>;
+defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile>;
+defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile>;
+defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile>;
 
-defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile, AMDGPUfmax3>;
-defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile, AMDGPUsmax3>;
-defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile, AMDGPUumax3>;
+defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile>;
+defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile>;
+defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile>;
 
 defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile>;
 defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile>;
@@ -649,13 +655,6 @@
 defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile>;
 } // End isReMaterializable = 1
-
-class ThreeOp_i32_Pats : GCNPat <
-  // This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
-  (ThreeOpFrag i32:$src0, i32:$src1, i32:$src2),
-  (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
->;
-
 def : ThreeOp_i32_Pats;
 def : ThreeOp_i32_Pats;
 def : ThreeOp_i32_Pats;
@@ -688,6 +687,39 @@
   def : OpSelBinOpClampPat;
 } // End SubtargetPredicate = isGFX9Plus
 
+class ThreeOp_i16_Pats : GCNPat <
+  (ThreeOpFrag i16:$src0, i16:$src1, i16:$src2),
+  (inst SRCMODS.NONE, i16:$src0, SRCMODS.NONE, i16:$src1,
+        SRCMODS.NONE, i16:$src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+class ThreeOpFP_Pats : GCNPat <
+  (DivergentBinFrag (HasOneUseBinOp
+      (VOP3Mods vt:$src0, i32:$src0_mods),
+      (VOP3Mods vt:$src1, i32:$src1_mods)),
+    (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+  (inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+        DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : ThreeOp_i32_Pats;
+def : ThreeOp_i32_Pats;
+def : ThreeOp_i32_Pats;
+def : ThreeOp_i32_Pats;
+def : ThreeOpFP_Pats;
+def : ThreeOpFP_Pats;
+
+let SubtargetPredicate = isGFX9Plus in {
+def : ThreeOp_i16_Pats;
+def : ThreeOp_i16_Pats;
+def : ThreeOp_i16_Pats;
+def : ThreeOp_i16_Pats;
+def : ThreeOpFP_Pats;
+def : ThreeOpFP_Pats;
+}
+
 def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> {
   let Src0RC64 = VRegSrc_32;
   let Src1RC64 = SCSrc_b32;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmax3.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/fmax3.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmax3.ll
@@ -10,9 +10,8 @@
 ; SI_VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI_VI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI_VI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI_VI-NEXT:    v_max_f32_e32 v0, v0, v1
-; SI_VI-NEXT:    v_mul_f32_e32 v1, 1.0, v2
-; SI_VI-NEXT:    v_max_f32_e32 v0, v0, v1
+; SI_VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; SI_VI-NEXT:    v_max3_f32 v0, v0, v1, v2
 ; SI_VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: test_max3_f32_ieee_true:
@@ -20,9 +19,8 @@
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_f32_ieee_true: @@ -31,9 +29,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %max3 = call float @llvm.maxnum.f32(float %max, float %c) @@ -46,9 +43,8 @@ ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI_VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI_VI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI_VI-NEXT: v_max_f32_e32 v0, v1, v0 +; SI_VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI_VI-NEXT: v_max3_f32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_max3_f32_commute_ieee_true: @@ -56,9 +52,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_f32_commute_ieee_true: @@ -67,9 +62,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %max = call float @llvm.maxnum.f32(float %a, float %b) %max3 = call float @llvm.maxnum.f32(float %c, float %max) @@ -105,9 +99,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_f16_ieee_true: @@ -116,9 +109,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %max3 = call half @llvm.maxnum.f16(half %max, half %c) @@ -154,9 +146,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_f16_commute_ieee_true: @@ -165,9 +156,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: 
v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %max = call half @llvm.maxnum.f16(half %a, half %b) %max3 = call half @llvm.maxnum.f16(half %c, half %max) @@ -177,8 +167,7 @@ define amdgpu_ps float @test_max3_f32_ieee_false(float %a, float %b, float %c) { ; GCN-LABEL: test_max3_f32_ieee_false: ; GCN: ; %bb.0: -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_max_f32_e32 v0, v0, v2 +; GCN-NEXT: v_max3_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) %max3 = call float @llvm.maxnum.f32(float %max, float %c) @@ -188,8 +177,7 @@ define amdgpu_ps float @test_max3_f32_commute_ieee_false(float %a, float %b, float %c) { ; GCN-LABEL: test_max3_f32_commute_ieee_false: ; GCN: ; %bb.0: -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: v_max_f32_e32 v0, v2, v0 +; GCN-NEXT: v_max3_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %max = call float @llvm.maxnum.f32(float %a, float %b) %max3 = call float @llvm.maxnum.f32(float %c, float %max) @@ -217,8 +205,7 @@ ; ; GFX9_10-LABEL: test_max3_f16_ieee_false: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX9_10-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) %max3 = call half @llvm.maxnum.f16(half %max, half %c) @@ -246,8 +233,7 @@ ; ; GFX9_10-LABEL: test_max3_f16_commute_ieee_false: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f16_e32 v0, v2, v0 +; GFX9_10-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: ; return to shader part epilog %max = call half @llvm.maxnum.f16(half %a, half %b) %max3 = call half @llvm.maxnum.f16(half %c, half %max) @@ -298,8 +284,7 @@ ; SI-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_f32_e64 v0, |v0|, v1 -; SI-NEXT: v_max_f32_e32 v0, v0, v2 +; SI-NEXT: v_max3_f32 v0, |v0|, v1, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -307,16 +292,14 @@ ; VI-LABEL: test_max3_f32_fabs_fneg: ; VI: ; %bb.0: ; VI-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; VI-NEXT: v_max_f32_e64 v0, |v0|, v1 -; VI-NEXT: v_max_f32_e32 v0, v0, v2 +; VI-NEXT: v_max3_f32 v0, |v0|, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_max3_f32_fabs_fneg: ; GFX9_10: ; %bb.0: ; GFX9_10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; GFX9_10-NEXT: v_max_f32_e64 v0, |v0|, v1 -; GFX9_10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_max3_f32 v0, |v0|, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %a) @@ -332,25 +315,22 @@ define amdgpu_ps void @test_fmax3_f32_vvv(float %a, float %b, float %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmax3_f32_vvv: ; SI: ; %bb.0: -; SI-NEXT: v_max_f32_e32 v0, v0, v1 +; SI-NEXT: v_max3_f32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_f32_e32 v0, v0, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax3_f32_vvv: ; VI: ; %bb.0: -; VI-NEXT: v_max_f32_e32 v0, v0, v1 -; VI-NEXT: v_max_f32_e32 v0, v0, 
v2 +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmax3_f32_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_max3_f32 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) @@ -362,25 +342,22 @@ define amdgpu_ps void @test_fmax3_f32_svv(float inreg %a, float %b, float %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmax3_f32_svv: ; SI: ; %bb.0: -; SI-NEXT: v_max_f32_e32 v0, s2, v0 +; SI-NEXT: v_max3_f32 v0, s2, v0, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax3_f32_svv: ; VI: ; %bb.0: -; VI-NEXT: v_max_f32_e32 v0, s2, v0 -; VI-NEXT: v_max_f32_e32 v0, v0, v1 +; VI-NEXT: v_max3_f32 v0, s2, v0, v1 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmax3_f32_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9_10-NEXT: v_max3_f32 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) @@ -392,25 +369,22 @@ define amdgpu_ps void @test_fmax3_f32_vvs(float %a, float %b, float inreg %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmax3_f32_vvs: ; SI: ; %bb.0: -; SI-NEXT: v_max_f32_e32 v0, v0, v1 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_max_f32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: v_max3_f32 v0, v0, v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax3_f32_vvs: ; VI: ; %bb.0: -; VI-NEXT: v_max_f32_e32 v0, v0, v1 -; VI-NEXT: v_max_f32_e32 v0, s2, v0 +; VI-NEXT: v_max3_f32 v0, v0, v1, s2 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmax3_f32_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f32_e32 v0, s2, v0 +; GFX9_10-NEXT: v_max3_f32 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) @@ -423,10 +397,9 @@ ; SI-LABEL: test_fmax3_f32_ssv: ; SI: ; %bb.0: ; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: v_max_f32_e32 v3, s2, v3 +; SI-NEXT: v_max3_f32 v0, s2, v3, v0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_f32_e32 v0, v3, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -434,23 +407,20 @@ ; VI-LABEL: test_fmax3_f32_ssv: ; VI: ; %bb.0: ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_max_f32_e32 v3, s2, v3 -; VI-NEXT: v_max_f32_e32 v0, v3, v0 +; VI-NEXT: v_max3_f32 v0, s2, v3, v0 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_fmax3_f32_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_max_f32_e32 v3, s2, v3 -; GFX9-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX9-NEXT: v_max3_f32 v0, s2, v3, v0 ; GFX9-NEXT: global_store_dword v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_fmax3_f32_ssv: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: v_max_f32_e64 v3, s2, s3 -; GFX10-NEXT: v_max_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_max3_f32 v0, s2, s3, v0 ; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) @@ -462,27 +432,33 @@ define amdgpu_ps void @test_fmax3_f32_vss(float %a, float inreg %b, float inreg %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmax3_f32_vss: ; SI: ; %bb.0: -; SI-NEXT: v_max_f32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_max_f32_e32 v0, s3, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_max3_f32 v0, v0, s2, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmax3_f32_vss: ; VI: ; %bb.0: -; VI-NEXT: v_max_f32_e32 v0, s2, v0 -; VI-NEXT: v_max_f32_e32 v0, s3, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_max3_f32 v0, v0, s2, v3 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_fmax3_f32_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_f32_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_dword v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_fmax3_f32_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_max3_f32 v0, v0, s2, v3 +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_fmax3_f32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) %fmax3 = call float @llvm.maxnum.f32(float %fmax, float %c) store float %fmax3, float addrspace(1)* %out, align 4 @@ -493,10 +469,10 @@ ; SI-LABEL: test_fmax3_f32_sss: ; SI: ; %bb.0: ; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_max_f32_e32 v2, s2, v2 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_max3_f32 v2, s2, v2, v3 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_f32_e32 v2, s4, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -504,23 +480,23 @@ ; VI-LABEL: test_fmax3_f32_sss: ; VI: ; %bb.0: ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_max_f32_e32 v2, s2, v2 -; VI-NEXT: v_max_f32_e32 v2, s4, v2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_max3_f32 v2, s2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_fmax3_f32_sss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_max_f32_e32 v2, s2, v2 -; GFX9-NEXT: v_max_f32_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_max3_f32 v2, s2, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_fmax3_f32_sss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_f32_e64 v2, s2, s3 -; GFX10-NEXT: v_max_f32_e32 v2, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_max3_f32 v2, s2, s3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %fmax = call float @llvm.maxnum.f32(float %a, float %b) @@ -555,8 +531,7 @@ ; ; GFX9_10-LABEL: test_max3_f16_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX9_10-NEXT: v_max3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_short v[3:4], v0, off ; 
GFX9_10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) @@ -591,8 +566,7 @@ ; ; GFX9_10-LABEL: test_max3_f16_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9_10-NEXT: v_max3_f16 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_short v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) @@ -627,8 +601,7 @@ ; ; GFX9_10-LABEL: test_max3_f16_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_f16_e32 v0, s2, v0 +; GFX9_10-NEXT: v_max3_f16 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_short v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) @@ -665,15 +638,13 @@ ; GFX9-LABEL: test_max3_f16_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_max_f16_e32 v3, s2, v3 -; GFX9-NEXT: v_max_f16_e32 v0, v3, v0 +; GFX9-NEXT: v_max3_f16 v0, s2, v3, v0 ; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_max3_f16_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_f16_e64 v3, s2, s3 -; GFX10-NEXT: v_max_f16_e32 v0, v3, v0 +; GFX10-NEXT: v_max3_f16 v0, s2, s3, v0 ; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) @@ -706,12 +677,18 @@ ; VI-NEXT: flat_store_short v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_max3_f16_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_f16_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_f16_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_short v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_max3_f16_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_max3_f16 v0, v0, s2, v3 +; GFX9-NEXT: global_store_short v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_max3_f16_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f16 v0, v0, s2, s3 +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) %fmax3 = call half @llvm.maxnum.f16(half %fmax, half %c) store half %fmax3, half addrspace(1)* %out, align 4 @@ -746,15 +723,15 @@ ; GFX9-LABEL: test_max3_f16_sss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_max_f16_e32 v2, s2, v2 -; GFX9-NEXT: v_max_f16_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_max3_f16 v2, s2, v2, v3 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_max3_f16_sss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_f16_e64 v2, s2, s3 -; GFX10-NEXT: v_max_f16_e32 v2, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_max3_f16 v2, s2, s3, v2 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %fmax = call half @llvm.maxnum.f16(half %a, half %b) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fmin3.ll @@ -10,9 +10,8 @@ ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI_VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI_VI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI_VI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI_VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI_VI-NEXT: v_min3_f32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_f32_ieee_true: @@ -20,9 +19,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_f32_ieee_true: @@ -31,9 +29,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %min3 = call float @llvm.minnum.f32(float %min, float %c) @@ -46,9 +43,8 @@ ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI_VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI_VI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI_VI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI_VI-NEXT: v_min_f32_e32 v0, v1, v0 +; SI_VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI_VI-NEXT: v_min3_f32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_f32_commute_ieee_true: @@ -56,9 +52,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_f32_commute_ieee_true: @@ -67,9 +62,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minnum.f32(float %a, float %b) %min3 = call float @llvm.minnum.f32(float %c, float %min) @@ -105,9 +99,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_f16_ieee_true: @@ -116,9 +109,8 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %min3 = call half @llvm.minnum.f16(half %min, half %c) @@ -154,9 +146,8 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX9-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_f16_commute_ieee_true: @@ -165,9 +156,8 @@ ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10-NEXT: v_max_f16_e32 v1, v2, v2 -; GFX10-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %min = call half @llvm.minnum.f16(half %a, half %b) %min3 = call half @llvm.minnum.f16(half %c, half %min) @@ -177,8 +167,7 @@ define amdgpu_ps float @test_min3_f32_ieee_false(float %a, float %b, float %c) { ; GCN-LABEL: test_min3_f32_ieee_false: ; GCN: ; %bb.0: -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_min_f32_e32 v0, v0, v2 +; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) %min3 = call float @llvm.minnum.f32(float %min, float %c) @@ -188,8 +177,7 @@ define amdgpu_ps float @test_min3_f32_commute_ieee_false(float %a, float %b, float %c) { ; GCN-LABEL: test_min3_f32_commute_ieee_false: ; GCN: ; %bb.0: -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: v_min_f32_e32 v0, v2, v0 +; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 ; GCN-NEXT: ; return to shader part epilog %min = call float @llvm.minnum.f32(float %a, float %b) %min3 = call float @llvm.minnum.f32(float %c, float %min) @@ -217,8 +205,7 @@ ; ; GFX9_10-LABEL: test_min3_f16_ieee_false: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX9_10-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) %min3 = call half @llvm.minnum.f16(half %min, half %c) @@ -246,8 +233,7 @@ ; ; GFX9_10-LABEL: test_min3_f16_commute_ieee_false: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f16_e32 v0, v2, v0 +; GFX9_10-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: ; return to shader part epilog %min = call half @llvm.minnum.f16(half %a, half %b) %min3 = call half @llvm.minnum.f16(half %c, half %min) @@ -298,8 +284,7 @@ ; SI-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_f32_e64 v0, |v0|, v1 -; SI-NEXT: v_min_f32_e32 v0, v0, v2 +; SI-NEXT: v_min3_f32 v0, |v0|, v1, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -307,16 +292,14 @@ ; VI-LABEL: test_min3_f32_fabs_fneg: ; VI: ; %bb.0: ; VI-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; VI-NEXT: v_min_f32_e64 v0, |v0|, v1 -; VI-NEXT: v_min_f32_e32 v0, v0, v2 +; VI-NEXT: v_min3_f32 v0, |v0|, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_min3_f32_fabs_fneg: ; GFX9_10: ; %bb.0: ; GFX9_10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 -; GFX9_10-NEXT: v_min_f32_e64 v0, |v0|, v1 -; GFX9_10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_min3_f32 v0, |v0|, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %a) @@ -332,25 +315,22 @@ define amdgpu_ps void @test_fmin3_f32_vvv(float %a, float %b, float %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmin3_f32_vvv: ; SI: ; %bb.0: -; SI-NEXT: v_min_f32_e32 v0, v0, v1 +; SI-NEXT: v_min3_f32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_f32_e32 v0, v0, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: 
test_fmin3_f32_vvv: ; VI: ; %bb.0: -; VI-NEXT: v_min_f32_e32 v0, v0, v1 -; VI-NEXT: v_min_f32_e32 v0, v0, v2 +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmin3_f32_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_min3_f32 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) @@ -362,25 +342,22 @@ define amdgpu_ps void @test_fmin3_f32_svv(float inreg %a, float %b, float %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmin3_f32_svv: ; SI: ; %bb.0: -; SI-NEXT: v_min_f32_e32 v0, s2, v0 +; SI-NEXT: v_min3_f32 v0, s2, v0, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin3_f32_svv: ; VI: ; %bb.0: -; VI-NEXT: v_min_f32_e32 v0, s2, v0 -; VI-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-NEXT: v_min3_f32 v0, s2, v0, v1 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmin3_f32_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9_10-NEXT: v_min3_f32 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) @@ -392,25 +369,22 @@ define amdgpu_ps void @test_fmin3_f32_vvs(float %a, float %b, float inreg %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmin3_f32_vvs: ; SI: ; %bb.0: -; SI-NEXT: v_min_f32_e32 v0, v0, v1 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_min_f32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: v_min3_f32 v0, v0, v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin3_f32_vvs: ; VI: ; %bb.0: -; VI-NEXT: v_min_f32_e32 v0, v0, v1 -; VI-NEXT: v_min_f32_e32 v0, s2, v0 +; VI-NEXT: v_min3_f32 v0, v0, v1, s2 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_fmin3_f32_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f32_e32 v0, s2, v0 +; GFX9_10-NEXT: v_min3_f32 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) @@ -423,10 +397,9 @@ ; SI-LABEL: test_fmin3_f32_ssv: ; SI: ; %bb.0: ; SI-NEXT: v_mov_b32_e32 v3, s3 -; SI-NEXT: v_min_f32_e32 v3, s2, v3 +; SI-NEXT: v_min3_f32 v0, s2, v3, v0 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_f32_e32 v0, v3, v0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -434,23 +407,20 @@ ; VI-LABEL: test_fmin3_f32_ssv: ; VI: ; %bb.0: ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_min_f32_e32 v3, s2, v3 -; VI-NEXT: v_min_f32_e32 v0, v3, v0 +; VI-NEXT: v_min3_f32 v0, s2, v3, v0 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_fmin3_f32_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_min_f32_e32 v3, s2, v3 -; GFX9-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX9-NEXT: v_min3_f32 v0, s2, v3, v0 ; GFX9-NEXT: 
global_store_dword v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_fmin3_f32_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_f32_e64 v3, s2, s3 -; GFX10-NEXT: v_min_f32_e32 v0, v3, v0 +; GFX10-NEXT: v_min3_f32 v0, s2, s3, v0 ; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) @@ -462,27 +432,33 @@ define amdgpu_ps void @test_fmin3_f32_vss(float %a, float inreg %b, float inreg %c, float addrspace(1)* %out) { ; SI-LABEL: test_fmin3_f32_vss: ; SI: ; %bb.0: -; SI-NEXT: v_min_f32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_min_f32_e32 v0, s3, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_min3_f32 v0, v0, s2, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_fmin3_f32_vss: ; VI: ; %bb.0: -; VI-NEXT: v_min_f32_e32 v0, s2, v0 -; VI-NEXT: v_min_f32_e32 v0, s3, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_min3_f32 v0, v0, s2, v3 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_fmin3_f32_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_f32_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_dword v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_fmin3_f32_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_min3_f32 v0, v0, s2, v3 +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_fmin3_f32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_f32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) %fmin3 = call float @llvm.minnum.f32(float %fmin, float %c) store float %fmin3, float addrspace(1)* %out, align 4 @@ -493,10 +469,10 @@ ; SI-LABEL: test_fmin3_f32_sss: ; SI: ; %bb.0: ; SI-NEXT: v_mov_b32_e32 v2, s3 -; SI-NEXT: v_min_f32_e32 v2, s2, v2 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_min3_f32 v2, s2, v2, v3 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_f32_e32 v2, s4, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -504,23 +480,23 @@ ; VI-LABEL: test_fmin3_f32_sss: ; VI: ; %bb.0: ; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_min_f32_e32 v2, s2, v2 -; VI-NEXT: v_min_f32_e32 v2, s4, v2 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_min3_f32 v2, s2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_fmin3_f32_sss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_min_f32_e32 v2, s2, v2 -; GFX9-NEXT: v_min_f32_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_min3_f32 v2, s2, v2, v3 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_fmin3_f32_sss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_f32_e64 v2, s2, s3 -; GFX10-NEXT: v_min_f32_e32 v2, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_min3_f32 v2, s2, s3, v2 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm %fmin = call float @llvm.minnum.f32(float %a, float %b) @@ -555,8 +531,7 @@ ; ; GFX9_10-LABEL: test_min3_f16_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f16_e32 
v0, v0, v2 +; GFX9_10-NEXT: v_min3_f16 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_short v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) @@ -591,8 +566,7 @@ ; ; GFX9_10-LABEL: test_min3_f16_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9_10-NEXT: v_min3_f16 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_short v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) @@ -627,8 +601,7 @@ ; ; GFX9_10-LABEL: test_min3_f16_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_f16_e32 v0, s2, v0 +; GFX9_10-NEXT: v_min3_f16 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_short v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) @@ -665,15 +638,13 @@ ; GFX9-LABEL: test_min3_f16_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_min_f16_e32 v3, s2, v3 -; GFX9-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX9-NEXT: v_min3_f16 v0, s2, v3, v0 ; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_min3_f16_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_f16_e64 v3, s2, s3 -; GFX10-NEXT: v_min_f16_e32 v0, v3, v0 +; GFX10-NEXT: v_min3_f16 v0, s2, s3, v0 ; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) @@ -706,12 +677,18 @@ ; VI-NEXT: flat_store_short v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_min3_f16_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_f16_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_f16_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_short v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_min3_f16_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_min3_f16 v0, v0, s2, v3 +; GFX9-NEXT: global_store_short v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_min3_f16_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_f16 v0, v0, s2, s3 +; GFX10-NEXT: global_store_short v[1:2], v0, off +; GFX10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) %fmin3 = call half @llvm.minnum.f16(half %fmin, half %c) store half %fmin3, half addrspace(1)* %out, align 4 @@ -746,15 +723,15 @@ ; GFX9-LABEL: test_min3_f16_sss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_min_f16_e32 v2, s2, v2 -; GFX9-NEXT: v_min_f16_e32 v2, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_min3_f16 v2, s2, v2, v3 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_min3_f16_sss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_f16_e64 v2, s2, s3 -; GFX10-NEXT: v_min_f16_e32 v2, s4, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_min3_f16 v2, s2, s3, v2 ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm %fmin = call half @llvm.minnum.f16(half %a, half %b) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/max3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/max3.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/max3.ll @@ -8,23 +8,20 @@ ; SI_VI-LABEL: test_max3_u32: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI_VI-NEXT: v_max_u32_e32 v0, v0, v2 +; SI_VI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_max3_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_max_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_max3_u32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_u32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_max3_u32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %umax3 = call i32 @llvm.umax.i32(i32 %umax, i32 %c) @@ -46,23 +43,20 @@ ; SI_VI-LABEL: test_max3_u32_commute: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI_VI-NEXT: v_max_u32_e32 v0, v2, v0 +; SI_VI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_max3_u32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_max3_u32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_u32_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_u32_e32 v0, v2, v0 +; GFX10-NEXT: v_max3_u32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) %umax3 = call i32 @llvm.umax.i32(i32 %c, i32 %umax) @@ -76,9 +70,8 @@ ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_and_b32_e32 v0, s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, s4, v2 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_max3_u16: @@ -91,16 +84,14 @@ ; GFX9-LABEL: test_max3_u16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 +; GFX9-NEXT: v_max3_u16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_u16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_u16 v0, v0, v1 -; GFX10-NEXT: v_max_u16 v0, v0, v2 +; GFX10-NEXT: v_max3_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umax = call i16 @llvm.umax.i16(i16 %a, i16 %b) %umax3 = call i16 @llvm.umax.i16(i16 %umax, i16 %c) @@ -114,9 +105,8 @@ ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_and_b32_e32 v0, s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, s4, v2 -; SI-NEXT: v_max_u32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_max3_u16_commute: @@ -129,16 +119,14 @@ ; GFX9-LABEL: test_max3_u16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u16_e32 v0, v2, v0 +; GFX9-NEXT: v_max3_u16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_u16_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_u16 v0, v0, v1 -; GFX10-NEXT: v_max_u16 v0, v2, v0 +; GFX10-NEXT: v_max3_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umax = call i16 @llvm.umax.i16(i16 %a, i16 %b) %umax3 = call i16 
@llvm.umax.i16(i16 %c, i16 %umax) @@ -149,23 +137,20 @@ ; SI_VI-LABEL: test_max3_i32: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI_VI-NEXT: v_max_i32_e32 v0, v0, v2 +; SI_VI-NEXT: v_max3_i32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_max3_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX9-NEXT: v_max3_i32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_i32_e32 v0, v0, v2 +; GFX10-NEXT: v_max3_i32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %smax3 = call i32 @llvm.smax.i32(i32 %smax, i32 %c) @@ -187,23 +172,20 @@ ; SI_VI-LABEL: test_max3_i32_commute: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_max_i32_e32 v0, v0, v1 -; SI_VI-NEXT: v_max_i32_e32 v0, v2, v0 +; SI_VI-NEXT: v_max3_i32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_max3_i32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i32_e32 v0, v2, v0 +; GFX9-NEXT: v_max3_i32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_i32_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_max_i32_e32 v0, v2, v0 +; GFX10-NEXT: v_max3_i32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) %smax3 = call i32 @llvm.smax.i32(i32 %c, i32 %smax) @@ -232,16 +214,14 @@ ; GFX9-LABEL: test_max3_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v0, v0, v2 +; GFX9-NEXT: v_max3_i16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_max_i16 v0, v0, v2 +; GFX10-NEXT: v_max3_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smax = call i16 @llvm.smax.i16(i16 %a, i16 %b) %smax3 = call i16 @llvm.smax.i16(i16 %smax, i16 %c) @@ -270,16 +250,14 @@ ; GFX9-LABEL: test_max3_i16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_i16_e32 v0, v2, v0 +; GFX9-NEXT: v_max3_i16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_max3_i16_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_max_i16 v0, v0, v1 -; GFX10-NEXT: v_max_i16 v0, v2, v0 +; GFX10-NEXT: v_max3_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smax = call i16 @llvm.smax.i16(i16 %a, i16 %b) %smax3 = call i16 @llvm.smax.i16(i16 %c, i16 %smax) @@ -329,25 +307,22 @@ define amdgpu_ps void @test_max3_u32_vvv(i32 %a, i32 %b, i32 %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_max3_u32_vvv: ; SI: ; %bb.0: -; SI-NEXT: v_max_u32_e32 v0, v0, v1 +; SI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 
s3, 0xf000 -; SI-NEXT: v_max_u32_e32 v0, v0, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_max3_u32_vvv: ; VI: ; %bb.0: -; VI-NEXT: v_max_u32_e32 v0, v0, v1 -; VI-NEXT: v_max_u32_e32 v0, v0, v2 +; VI-NEXT: v_max3_u32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_max3_u32_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_u32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_max3_u32 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -359,25 +334,22 @@ define amdgpu_ps void @test_max3_u32_svv(i32 inreg %a, i32 %b, i32 %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_max3_u32_svv: ; SI: ; %bb.0: -; SI-NEXT: v_max_u32_e32 v0, s2, v0 +; SI-NEXT: v_max3_u32 v0, s2, v0, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_max3_u32_svv: ; VI: ; %bb.0: -; VI-NEXT: v_max_u32_e32 v0, s2, v0 -; VI-NEXT: v_max_u32_e32 v0, v0, v1 +; VI-NEXT: v_max3_u32 v0, s2, v0, v1 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_max3_u32_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_u32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_u32_e32 v0, v0, v1 +; GFX9_10-NEXT: v_max3_u32 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -389,25 +361,22 @@ define amdgpu_ps void @test_max3_u32_vvs(i32 %a, i32 %b, i32 inreg %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_max3_u32_vvs: ; SI: ; %bb.0: -; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_max_u32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: v_max3_u32 v0, v0, v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_max3_u32_vvs: ; VI: ; %bb.0: -; VI-NEXT: v_max_u32_e32 v0, v0, v1 -; VI-NEXT: v_max_u32_e32 v0, s2, v0 +; VI-NEXT: v_max3_u32 v0, v0, v1, s2 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_max3_u32_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_u32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_max_u32_e32 v0, s2, v0 +; GFX9_10-NEXT: v_max3_u32 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -449,27 +418,33 @@ define amdgpu_ps void @test_max3_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_max3_i32_vss: ; SI: ; %bb.0: -; SI-NEXT: v_max_i32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_max_i32_e32 v0, s3, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_max3_i32 v0, v0, s2, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_max3_i32_vss: ; VI: ; %bb.0: -; VI-NEXT: v_max_i32_e32 v0, s2, v0 -; VI-NEXT: v_max_i32_e32 v0, s3, v0 +; 
VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_max3_i32 v0, v0, s2, v3 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_max3_i32_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_max_i32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_max_i32_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_dword v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_max3_i32_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_max3_i32 v0, v0, s2, v3 +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_max3_i32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_i32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm %max = call i32 @llvm.smax.i32(i32 %a, i32 %b) %max3 = call i32 @llvm.smax.i32(i32 %max, i32 %c) store i32 %max3, i32 addrspace(1)* %out, align 4 @@ -518,8 +493,7 @@ ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 ; SI-NEXT: v_and_b32_e32 v2, s0, v2 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: v_max_u32_e32 v0, v0, v2 +; SI-NEXT: v_max3_u32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_short v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -531,19 +505,11 @@ ; VI-NEXT: flat_store_short v[3:4], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_max3_u16_vvv: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u16_e32 v0, v0, v2 -; GFX9-NEXT: global_store_short v[3:4], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_max3_u16_vvv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_u16 v0, v0, v1 -; GFX10-NEXT: v_max_u16 v0, v0, v2 -; GFX10-NEXT: global_store_short v[3:4], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_max3_u16_vvv: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_max3_u16 v0, v0, v1, v2 +; GFX9_10-NEXT: global_store_short v[3:4], v0, off +; GFX9_10-NEXT: s_endpgm %max = call i16 @llvm.umax.i16(i16 %a, i16 %b) %max3 = call i16 @llvm.umax.i16(i16 %max, i16 %c) store i16 %max3, i16 addrspace(1)* %out, align 4 @@ -559,8 +525,7 @@ ; SI-NEXT: s_and_b32 s1, s2, s0 ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 -; SI-NEXT: v_max_u32_e32 v0, s1, v0 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 +; SI-NEXT: v_max3_u32 v0, s1, v0, v1 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -572,19 +537,11 @@ ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_max3_u16_svv: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_u16_e32 v0, s2, v0 -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[2:3], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_max3_u16_svv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_u16 v0, s2, v0 -; GFX10-NEXT: v_max_u16 v0, v0, v1 -; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_max3_u16_svv: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_max3_u16 v0, s2, v0, v1 +; GFX9_10-NEXT: global_store_short v[2:3], v0, off +; GFX9_10-NEXT: s_endpgm %max = call i16 @llvm.umax.i16(i16 %a, i16 %b) %max3 = call i16 @llvm.umax.i16(i16 %max, i16 %c) store i16 %max3, i16 addrspace(1)* %out, align 4 @@ -600,8 +557,7 @@ ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 ; SI-NEXT: s_and_b32 s0, s2, s0 -; SI-NEXT: v_max_u32_e32 v0, v0, v1 -; SI-NEXT: v_max_u32_e32 v0, s0, v0 +; SI-NEXT: v_max3_u32 v0, v0, v1, s0 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -613,19 +569,11 
@@ ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_max3_u16_vvs: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_max_u16_e32 v0, s2, v0 -; GFX9-NEXT: global_store_short v[2:3], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_max3_u16_vvs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_u16 v0, v0, v1 -; GFX10-NEXT: v_max_u16 v0, v0, s2 -; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_max3_u16_vvs: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_max3_u16 v0, v0, v1, s2 +; GFX9_10-NEXT: global_store_short v[2:3], v0, off +; GFX9_10-NEXT: s_endpgm %max = call i16 @llvm.umax.i16(i16 %a, i16 %b) %max3 = call i16 @llvm.umax.i16(i16 %max, i16 %c) store i16 %max3, i16 addrspace(1)* %out, align 4 @@ -703,15 +651,14 @@ ; ; GFX9-LABEL: test_max3_i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_max_i16_e32 v0, s2, v0 -; GFX9-NEXT: v_max_i16_e32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_max3_i16 v0, v0, s2, v3 ; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_max3_i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_max_i16 v0, v0, s2 -; GFX10-NEXT: v_max_i16 v0, v0, s3 +; GFX10-NEXT: v_max3_i16 v0, v0, s2, s3 ; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %max = call i16 @llvm.smax.i16(i16 %a, i16 %b) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/min3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/min3.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/min3.ll @@ -8,23 +8,20 @@ ; SI_VI-LABEL: test_min3_u32: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI_VI-NEXT: v_min_u32_e32 v0, v0, v2 +; SI_VI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_u32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_min3_u32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_u32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_min3_u32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umin3 = call i32 @llvm.umin.i32(i32 %umin, i32 %c) @@ -46,23 +43,20 @@ ; SI_VI-LABEL: test_min3_u32_commute: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI_VI-NEXT: v_min_u32_e32 v0, v2, v0 +; SI_VI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_u32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_min3_u32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_u32_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX10-NEXT: v_min3_u32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) %umin3 = call i32 @llvm.umin.i32(i32 %c, i32 %umin) @@ -76,9 +70,8 @@ ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_and_b32_e32 v0, 
s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, s4, v2 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_min3_u16: @@ -91,16 +84,14 @@ ; GFX9-LABEL: test_min3_u16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v0, v0, v2 +; GFX9-NEXT: v_min3_u16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_u16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, v2 +; GFX10-NEXT: v_min3_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umin = call i16 @llvm.umin.i16(i16 %a, i16 %b) %umin3 = call i16 @llvm.umin.i16(i16 %umin, i16 %c) @@ -114,9 +105,8 @@ ; SI-NEXT: s_mov_b32 s4, 0xffff ; SI-NEXT: v_and_b32_e32 v0, s4, v0 ; SI-NEXT: v_and_b32_e32 v1, s4, v1 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, s4, v2 -; SI-NEXT: v_min_u32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: test_min3_u16_commute: @@ -129,16 +119,14 @@ ; GFX9-LABEL: test_min3_u16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v0, v2, v0 +; GFX9-NEXT: v_min3_u16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_u16_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v2, v0 +; GFX10-NEXT: v_min3_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %umin = call i16 @llvm.umin.i16(i16 %a, i16 %b) %umin3 = call i16 @llvm.umin.i16(i16 %c, i16 %umin) @@ -149,23 +137,20 @@ ; SI_VI-LABEL: test_min3_i32: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_min_i32_e32 v0, v0, v1 -; SI_VI-NEXT: v_min_i32_e32 v0, v0, v2 +; SI_VI-NEXT: v_min3_i32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_i32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX9-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX9-NEXT: v_min3_i32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_i32_e32 v0, v0, v2 +; GFX10-NEXT: v_min3_i32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smin3 = call i32 @llvm.smin.i32(i32 %smin, i32 %c) @@ -187,23 +172,20 @@ ; SI_VI-LABEL: test_min3_i32_commute: ; SI_VI: ; %bb.0: ; SI_VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI_VI-NEXT: v_min_i32_e32 v0, v0, v1 -; SI_VI-NEXT: v_min_i32_e32 v0, v2, v0 +; SI_VI-NEXT: v_min3_i32 v0, v0, v1, v2 ; SI_VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_min3_i32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX9-NEXT: v_min_i32_e32 v0, v2, v0 +; GFX9-NEXT: v_min3_i32 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_i32_commute: ; GFX10: ; %bb.0: ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i32_e32 v0, v0, v1 -; GFX10-NEXT: v_min_i32_e32 v0, v2, v0 +; GFX10-NEXT: v_min3_i32 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) %smin3 = call i32 @llvm.smin.i32(i32 %c, i32 %smin) @@ -232,16 +214,14 @@ ; GFX9-LABEL: test_min3_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_i16_e32 v0, v0, v2 +; GFX9-NEXT: v_min3_i16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i16 v0, v0, v1 -; GFX10-NEXT: v_min_i16 v0, v0, v2 +; GFX10-NEXT: v_min3_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smin = call i16 @llvm.smin.i16(i16 %a, i16 %b) %smin3 = call i16 @llvm.smin.i16(i16 %smin, i16 %c) @@ -270,16 +250,14 @@ ; GFX9-LABEL: test_min3_i16_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_i16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_i16_e32 v0, v2, v0 +; GFX9-NEXT: v_min3_i16 v0, v0, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_min3_i16_commute: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_min_i16 v0, v0, v1 -; GFX10-NEXT: v_min_i16 v0, v2, v0 +; GFX10-NEXT: v_min3_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %smin = call i16 @llvm.smin.i16(i16 %a, i16 %b) %smin3 = call i16 @llvm.smin.i16(i16 %c, i16 %smin) @@ -329,25 +307,22 @@ define amdgpu_ps void @test_min3_u32_vvv(i32 %a, i32 %b, i32 %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_min3_u32_vvv: ; SI: ; %bb.0: -; SI-NEXT: v_min_u32_e32 v0, v0, v1 +; SI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_u32_e32 v0, v0, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_min3_u32_vvv: ; VI: ; %bb.0: -; VI-NEXT: v_min_u32_e32 v0, v0, v1 -; VI-NEXT: v_min_u32_e32 v0, v0, v2 +; VI-NEXT: v_min3_u32 v0, v0, v1, v2 ; VI-NEXT: flat_store_dword v[3:4], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_min3_u32_vvv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX9_10-NEXT: v_min3_u32 v0, v0, v1, v2 ; GFX9_10-NEXT: global_store_dword v[3:4], v0, off ; GFX9_10-NEXT: s_endpgm %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) @@ -359,25 +334,22 @@ define amdgpu_ps void @test_min3_u32_svv(i32 inreg %a, i32 %b, i32 %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_min3_u32_svv: ; SI: ; %bb.0: -; SI-NEXT: v_min_u32_e32 v0, s2, v0 +; SI-NEXT: v_min3_u32 v0, s2, v0, v1 ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_min3_u32_svv: ; VI: ; %bb.0: -; VI-NEXT: v_min_u32_e32 v0, s2, v0 -; VI-NEXT: v_min_u32_e32 v0, v0, v1 +; VI-NEXT: v_min3_u32 v0, s2, v0, v1 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_min3_u32_svv: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_u32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9_10-NEXT: v_min3_u32 v0, s2, v0, v1 ; GFX9_10-NEXT: global_store_dword v[2:3], 
v0, off ; GFX9_10-NEXT: s_endpgm %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) @@ -389,25 +361,22 @@ define amdgpu_ps void @test_min3_u32_vvs(i32 %a, i32 %b, i32 inreg %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_min3_u32_vvs: ; SI: ; %bb.0: -; SI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_min_u32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: v_min3_u32 v0, v0, v1, s2 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_min3_u32_vvs: ; VI: ; %bb.0: -; VI-NEXT: v_min_u32_e32 v0, v0, v1 -; VI-NEXT: v_min_u32_e32 v0, s2, v0 +; VI-NEXT: v_min3_u32 v0, v0, v1, s2 ; VI-NEXT: flat_store_dword v[2:3], v0 ; VI-NEXT: s_endpgm ; ; GFX9_10-LABEL: test_min3_u32_vvs: ; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_u32_e32 v0, v0, v1 -; GFX9_10-NEXT: v_min_u32_e32 v0, s2, v0 +; GFX9_10-NEXT: v_min3_u32 v0, v0, v1, s2 ; GFX9_10-NEXT: global_store_dword v[2:3], v0, off ; GFX9_10-NEXT: s_endpgm %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) @@ -449,27 +418,33 @@ define amdgpu_ps void @test_min3_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { ; SI-LABEL: test_min3_i32_vss: ; SI: ; %bb.0: -; SI-NEXT: v_min_i32_e32 v0, s2, v0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: v_min_i32_e32 v0, s3, v0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; SI-NEXT: v_mov_b32_e32 v3, s3 +; SI-NEXT: v_min3_i32 v0, v0, s2, v3 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_min3_i32_vss: ; VI: ; %bb.0: -; VI-NEXT: v_min_i32_e32 v0, s2, v0 -; VI-NEXT: v_min_i32_e32 v0, s3, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_min3_i32 v0, v0, s2, v3 ; VI-NEXT: flat_store_dword v[1:2], v0 ; VI-NEXT: s_endpgm ; -; GFX9_10-LABEL: test_min3_i32_vss: -; GFX9_10: ; %bb.0: -; GFX9_10-NEXT: v_min_i32_e32 v0, s2, v0 -; GFX9_10-NEXT: v_min_i32_e32 v0, s3, v0 -; GFX9_10-NEXT: global_store_dword v[1:2], v0, off -; GFX9_10-NEXT: s_endpgm +; GFX9-LABEL: test_min3_i32_vss: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_min3_i32 v0, v0, s2, v3 +; GFX9-NEXT: global_store_dword v[1:2], v0, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: test_min3_i32_vss: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_i32 v0, v0, s2, s3 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm %min = call i32 @llvm.smin.i32(i32 %a, i32 %b) %min3 = call i32 @llvm.smin.i32(i32 %min, i32 %c) store i32 %min3, i32 addrspace(1)* %out, align 4 @@ -518,8 +493,7 @@ ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 ; SI-NEXT: v_and_b32_e32 v2, s0, v2 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI-NEXT: v_min_u32_e32 v0, v0, v2 +; SI-NEXT: v_min3_u32 v0, v0, v1, v2 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: buffer_store_short v0, v[3:4], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm @@ -531,19 +505,11 @@ ; VI-NEXT: flat_store_short v[3:4], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_min3_u16_vvv: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v0, v0, v2 -; GFX9-NEXT: global_store_short v[3:4], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_min3_u16_vvv: -; GFX10: ; %bb.0: -; GFX10-NEXT: 
v_min_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, v2 -; GFX10-NEXT: global_store_short v[3:4], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_min3_u16_vvv: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_min3_u16 v0, v0, v1, v2 +; GFX9_10-NEXT: global_store_short v[3:4], v0, off +; GFX9_10-NEXT: s_endpgm %min = call i16 @llvm.umin.i16(i16 %a, i16 %b) %min3 = call i16 @llvm.umin.i16(i16 %min, i16 %c) store i16 %min3, i16 addrspace(1)* %out, align 4 @@ -559,8 +525,7 @@ ; SI-NEXT: s_and_b32 s1, s2, s0 ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 -; SI-NEXT: v_min_u32_e32 v0, s1, v0 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 +; SI-NEXT: v_min3_u32 v0, s1, v0, v1 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -572,19 +537,11 @@ ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_min3_u16_svv: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, s2, v0 -; GFX9-NEXT: v_min_u16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[2:3], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_min3_u16_svv: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16 v0, s2, v0 -; GFX10-NEXT: v_min_u16 v0, v0, v1 -; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_min3_u16_svv: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_min3_u16 v0, s2, v0, v1 +; GFX9_10-NEXT: global_store_short v[2:3], v0, off +; GFX9_10-NEXT: s_endpgm %min = call i16 @llvm.umin.i16(i16 %a, i16 %b) %min3 = call i16 @llvm.umin.i16(i16 %min, i16 %c) store i16 %min3, i16 addrspace(1)* %out, align 4 @@ -600,8 +557,7 @@ ; SI-NEXT: v_and_b32_e32 v0, s0, v0 ; SI-NEXT: v_and_b32_e32 v1, s0, v1 ; SI-NEXT: s_and_b32 s0, s2, s0 -; SI-NEXT: v_min_u32_e32 v0, v0, v1 -; SI-NEXT: v_min_u32_e32 v0, s0, v0 +; SI-NEXT: v_min3_u32 v0, v0, v1, s0 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; SI-NEXT: s_endpgm @@ -613,19 +569,11 @@ ; VI-NEXT: flat_store_short v[2:3], v0 ; VI-NEXT: s_endpgm ; -; GFX9-LABEL: test_min3_u16_vvs: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_u16_e32 v0, v0, v1 -; GFX9-NEXT: v_min_u16_e32 v0, s2, v0 -; GFX9-NEXT: global_store_short v[2:3], v0, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: test_min3_u16_vvs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, s2 -; GFX10-NEXT: global_store_short v[2:3], v0, off -; GFX10-NEXT: s_endpgm +; GFX9_10-LABEL: test_min3_u16_vvs: +; GFX9_10: ; %bb.0: +; GFX9_10-NEXT: v_min3_u16 v0, v0, v1, s2 +; GFX9_10-NEXT: global_store_short v[2:3], v0, off +; GFX9_10-NEXT: s_endpgm %min = call i16 @llvm.umin.i16(i16 %a, i16 %b) %min3 = call i16 @llvm.umin.i16(i16 %min, i16 %c) store i16 %min3, i16 addrspace(1)* %out, align 4 @@ -703,15 +651,14 @@ ; ; GFX9-LABEL: test_min3_i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_min_i16_e32 v0, s2, v0 -; GFX9-NEXT: v_min_i16_e32 v0, s3, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_min3_i16 v0, v0, s2, v3 ; GFX9-NEXT: global_store_short v[1:2], v0, off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: test_min3_i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_min_i16 v0, v0, s2 -; GFX10-NEXT: v_min_i16 v0, v0, s3 +; GFX10-NEXT: v_min3_i16 v0, v0, s2, s3 ; GFX10-NEXT: global_store_short v[1:2], v0, off ; GFX10-NEXT: s_endpgm %min = call i16 @llvm.smin.i16(i16 %a, i16 %b) Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -524,10 
+524,11 @@ ; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_flbit_i32_b32 s5, s5 ; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -542,7 +543,8 @@ ; VI-NEXT: s_flbit_i32_b32 s4, s4 ; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp ; VI-NEXT: s_flbit_i32_b32 s4, s5 -; VI-NEXT: v_min3_u32 v0, v0, s4, 64 +; VI-NEXT: v_min_u32_e32 v0, s4, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -574,7 +576,8 @@ ; GFX10-NEXT: s_flbit_i32_b32 s0, s2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -603,14 +606,15 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 -; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: s_flbit_i32_b32 s5, s5 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s5 +; SI-NEXT: s_min_u32 s2, s2, 0xffffffdf +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: s_min_u32 s2, s2, s4 +; SI-NEXT: s_min_u32 s4, s2, 64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -624,7 +628,8 @@ ; VI-NEXT: s_flbit_i32_b32 s4, s4 ; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp ; VI-NEXT: s_flbit_i32_b32 s4, s5 -; VI-NEXT: v_min3_u32 v0, v0, s4, 64 +; VI-NEXT: v_min_u32_e32 v0, s4, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -655,7 +660,8 @@ ; GFX10-NEXT: s_flbit_i32_b32 s0, s2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -775,9 +781,8 @@ ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_min3_u32 v0, v1, v0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -888,8 +893,7 @@ ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v1, v2, v1, 64 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/cttz.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/cttz.ll +++ llvm/test/CodeGen/AMDGPU/cttz.ll @@ -515,12 +515,13 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b32 s5, s5 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf ; SI-NEXT: s_add_i32 s5, s5, 32 -; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -535,7 +536,8 @@ ; VI-NEXT: s_ff1_i32_b32 s5, s5 ; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp ; VI-NEXT: s_ff1_i32_b32 s4, s4 -; VI-NEXT: v_min3_u32 v0, s4, v0, 64 +; VI-NEXT: v_min_u32_e32 v0, s4, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -567,7 +569,8 @@ ; GFX10-NEXT: s_ff1_i32_b32 s0, s3 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -596,14 +599,15 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s5, s5 -; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf -; SI-NEXT: s_add_i32 s5, s5, 32 +; SI-NEXT: s_ff1_i32_b32 s2, s5 ; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_min_u32 s2, s2, 0xffffffdf +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: s_min_u32 s2, s4, s2 +; SI-NEXT: s_min_u32 s4, s2, 64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -617,7 +621,8 @@ ; VI-NEXT: s_ff1_i32_b32 s5, s5 ; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp ; VI-NEXT: s_ff1_i32_b32 s4, s4 -; VI-NEXT: v_min3_u32 v0, s4, v0, 64 +; VI-NEXT: v_min_u32_e32 v0, s4, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -648,7 +653,8 @@ ; GFX10-NEXT: s_ff1_i32_b32 s0, s3 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -768,9 +774,8 @@ ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -881,8 +886,7 @@ ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v1, v1, v2, 64 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: 
llvm/test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmax3.ll +++ llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -1,13 +1,20 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,CST_BUS_LIM_1 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CST_BUS_LIM_1 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SI_VI,CST_BUS_LIM_1 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SI_VI,CST_BUS_LIM_1 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9_10,CST_BUS_LIM_1 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX9_10,CST_BUS_LIM_2 %s ; GCN-LABEL: {{^}}test_fmax3_olt_0_f32: -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GCN: buffer_load_dword [[REGB:v[0-9]+]] +; GCN: buffer_load_dword [[REGC:v[0-9]+]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; GFX9_10: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GCN: v_max_f32_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9_10: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[FMAX]], [[QUIET_C]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { @@ -22,10 +29,17 @@ ; Commute operand of second fmax ; GCN-LABEL: {{^}}test_fmax3_olt_1_f32: -; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] +; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; GFX9_10: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GCN: v_max_f32_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9_10: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[FMAX]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { @@ -46,7 +60,11 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 
[[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_max_f32_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI-DAG: v_max_f32_e32 [[RESULT_F32:v[0-9]+]], [[FMAX]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] @@ -55,7 +73,12 @@ ; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]] -; GFX9_10: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] + +; GFX9_10: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9_10: v_max_f16_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; GFX9_10: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9_10: v_max_f16_e32 [[RESULT:v[0-9]+]], [[FMAX]], [[QUIET_C]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -76,7 +99,11 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_max_f32_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI-DAG: v_max_f32_e32 [[RESULT_F32:v[0-9]+]], [[QUIET_C]], [[FMAX]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] @@ -85,7 +112,11 @@ ; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]] -; GFX9_10: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] +; GFX9_10: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9_10: v_max_f16_e32 [[FMAX:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; GFX9_10: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9_10: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[FMAX]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -206,11 +237,11 @@ ; GCN-LABEL: {{^}}test_max3_f32_sss: ; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_1: v_max3_f32 v{{[0-9]+}}, s0, v[[B]], v[[C]] +; CST_BUS_LIM_1: v_max_f32_e32 v[[FMAX:[0-9]+]], s0, v[[B]] +; CST_BUS_LIM_1: v_max_f32_e32 v{{[0-9]+}}, s2, v[[FMAX]] -; CST_BUS_LIM_2: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_2: v_max3_f32 v{{[0-9]+}}, s0, s1, v[[C]] +; CST_BUS_LIM_2: v_max_f32_e64 v[[FMAX:[0-9]+]], s0, s1 +; CST_BUS_LIM_2: v_max_f32_e32 v{{[0-9]+}}, s2, v[[FMAX]] define amdgpu_ps void @test_max3_f32_sss(float inreg %a, float inreg %b, float inreg %c, float addrspace(1)* %out) { %fmax = call float @llvm.maxnum.f32(float %a, float %b) %fmax3 = call float @llvm.maxnum.f32(float %fmax, float %c) @@ -320,15 +351,16 @@ ; SI: v_cvt_f32_f16_e32 v[[C:[0-9]+]], v[[C_F16]] ; SI: 
v_cvt_f32_f16_e32 v[[B:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[A:[0-9]+]], v[[A_F16]] -; SI: v_max3_f32 v{{[0-9]+}}, v[[A]], v[[B]], v[[C]] +; SI: v_max_f32_e32 v[[FMAX:[0-9]+]], v[[A]], v[[B]] +; SI: v_max_f32_e32 v{{[0-9]+}}, v[[FMAX]], v[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_max_f16_e32 v[[MAX:[0-9]+]], s0, v[[B]] ; VI: v_max_f16_e32 v{{[0-9]+}}, s2, v[[MAX]] ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX9: v_max3_f16 v{{[0-9]+}}, s0, v[[B]], v[[C]] -; GFX10: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX10: v_max3_f16 v{{[0-9]+}}, s0, s1, v[[C]] +; GFX9: v_max_f16_e32 v[[FMAX:[0-9]+]], s0, v[[B]] +; GFX9: v_max_f16_e32 v{{[0-9]+}}, s2, v[[FMAX]] +; GFX10: v_max_f16_e64 v[[FMAX:[0-9]+]], s0, s1 +; GFX10: v_max_f16_e32 v{{[0-9]+}}, s2, v[[FMAX]] define amdgpu_ps void @test_max3_f16_sss(half inreg %a, half inreg %b, half inreg %c, half addrspace(1)* %out) { %fmax = call half @llvm.maxnum.f16(half %a, half %b) %fmax3 = call half @llvm.maxnum.f16(half %fmax, half %c) Index: llvm/test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmin3.ll +++ llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1,13 +1,20 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,CST_BUS_LIM_1 %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CST_BUS_LIM_1 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SI_VI,CST_BUS_LIM_1 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SI_VI,CST_BUS_LIM_1 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9_10,CST_BUS_LIM_1 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX9_10,CST_BUS_LIM_2 %s ; GCN-LABEL: {{^}}test_fmin3_olt_0_f32: -; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GCN: buffer_load_dword [[REGB:v[0-9]+]] +; GCN: buffer_load_dword [[REGC:v[0-9]+]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; GFX9_10: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GCN: v_min_f32_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9_10: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[FMIN]], [[QUIET_C]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { %a = load volatile float, float addrspace(1)* %aptr, align 4 @@ -21,10 +28,17 @@ ; Commute operand of second fmin ; GCN-LABEL: {{^}}test_fmin3_olt_1_f32: -; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] +; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], 
[[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; GFX9_10: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GCN: v_min_f32_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9_10: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[FMIN]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { %a = load volatile float, float addrspace(1)* %aptr, align 4 @@ -37,17 +51,29 @@ } ; GCN-LABEL: {{^}}test_fmin3_olt_0_f16: -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGA:v[0-9]+]] +; GCN: buffer_load_ushort [[REGB:v[0-9]+]] +; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_min_f32_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI-DAG: v_min_f32_e32 [[RESULT_F32:v[0-9]+]], [[FMIN]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] ; VI: v_min_f16_e32 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], -; GFX9_10: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GFX9_10: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9_10: v_min_f16_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; GFX9_10: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9_10: v_min_f16_e32 [[RESULT:v[0-9]+]], [[FMIN]], [[QUIET_C]] + ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -68,13 +94,22 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_min_f32_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI-DAG: v_min_f32_e32 [[RESULT_F32:v[0-9]+]], [[QUIET_C]], [[FMIN]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI: v_min_f16_e32 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], -; GFX9_10: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] +; GFX9_10: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9_10: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9_10: v_min_f16_e32 [[FMIN:v[0-9]+]], [[QUIET_A]], [[QUIET_B]] +; GFX9_10: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9_10: v_min_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[FMIN]] + ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void 
@test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -221,11 +256,11 @@ ; GCN-LABEL: {{^}}test_fmin3_f32_sss: ; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_1: v_min3_f32 v{{[0-9]+}}, s0, v[[B]], v[[C]] +; CST_BUS_LIM_1: v_min_f32_e32 v[[FMIN:[0-9]+]], s0, v[[B]] +; CST_BUS_LIM_1: v_min_f32_e32 v{{[0-9]+}}, s2, v[[FMIN]] -; CST_BUS_LIM_2: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_2: v_min3_f32 v{{[0-9]+}}, s0, s1, v[[C]] +; CST_BUS_LIM_2: v_min_f32_e64 v[[FMIN:[0-9]+]], s0, s1 +; CST_BUS_LIM_2: v_min_f32_e32 v{{[0-9]+}}, s2, v[[FMIN]] define amdgpu_ps void @test_fmin3_f32_sss(float inreg %a, float inreg %b, float inreg %c, float addrspace(1)* %out) { %fmin = call float @llvm.minnum.f32(float %a, float %b) %fmin3 = call float @llvm.minnum.f32(float %fmin, float %c) @@ -335,15 +370,16 @@ ; SI: v_cvt_f32_f16_e32 v[[C:[0-9]+]], v[[C_F16]] ; SI: v_cvt_f32_f16_e32 v[[B:[0-9]+]], v[[B_F16]] ; SI: v_cvt_f32_f16_e32 v[[A:[0-9]+]], v[[A_F16]] -; SI: v_min3_f32 v{{[0-9]+}}, v[[A]], v[[B]], v[[C]] +; SI: v_min_f32_e32 v[[FMIN:[0-9]+]], v[[A]], v[[B]] +; SI: v_min_f32_e32 v{{[0-9]+}}, v[[FMIN]], v[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_min_f16_e32 v[[MIN:[0-9]+]], s0, v[[B]] ; VI: v_min_f16_e32 v{{[0-9]+}}, s2, v[[MIN]] ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX9: v_min3_f16 v{{[0-9]+}}, s0, v[[B]], v[[C]] -; GFX10: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX10: v_min3_f16 v{{[0-9]+}}, s0, s1, v[[C]] +; GFX9: v_min_f16_e32 v[[FMIN:[0-9]+]], s0, v[[B]] +; GFX9: v_min_f16_e32 v{{[0-9]+}}, s2, v[[FMIN]] +; GFX10: v_min_f16_e64 v[[FMIN:[0-9]+]], s0, s1 +; GFX10: v_min_f16_e32 v{{[0-9]+}}, s2, v[[FMIN]] define amdgpu_ps void @test_min3_f16_sss(half inreg %a, half inreg %b, half inreg %c, half addrspace(1)* %out) { %fmin = call half @llvm.minnum.f16(half %a, half %b) %fmin3 = call half @llvm.minnum.f16(half %fmin, half %c) Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -540,6 +540,9 @@ ; GCN-LABEL: v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/max3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max3.ll +++ llvm/test/CodeGen/AMDGPU/max3.ll @@ -302,8 +302,8 @@ } ; GCN-LABEL: {{^}}test_max3_i32_ssv: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_max3_i32 v{{[0-9]+}}, s0, v[[B]], v0 +; CST_BUS_LIM_1: s_max_i32 s[[S_MIN:[0-9]+]], s0, s1 +; CST_BUS_LIM_1: v_max_i32_e32 v{{[0-9]+}}, s[[S_MIN]], v0 ; CST_BUS_LIM_2: v_max3_i32 v{{[0-9]+}}, s0, s1, v0 define amdgpu_ps void @test_max3_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c, i32 addrspace(1)* %out) { @@ -314,8 +314,8 @@ } ; GCN-LABEL: {{^}}test_max3_i32_vss: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s1 -; CST_BUS_LIM_1: v_max3_i32 v{{[0-9]+}}, v0, s0, v[[C]] +; CST_BUS_LIM_1: v_max_i32_e32 v[[MIN:[0-9]+]], s0, v0 +; CST_BUS_LIM_1: 
v_max_i32_e32 v{{[0-9]+}}, s1, v[[MIN]] ; CST_BUS_LIM_2: v_max3_i32 v{{[0-9]+}}, v0, s0, s1 define amdgpu_ps void @test_max3_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { @@ -326,12 +326,8 @@ } ; GCN-LABEL: {{^}}test_max3_i32_sss: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_1: v_max3_i32 v{{[0-9]+}}, s0, v[[B]], v[[C]] - -; CST_BUS_LIM_2: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_2: v_max3_i32 v{{[0-9]+}}, s0, s1, v[[C]] +; GCN: s_max_i32 s[[S_MIN:[0-9]+]], s0, s1 +; GCN: s_max_i32 s{{[0-9]+}}, s[[S_MIN]], s2 define amdgpu_ps void @test_max3_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { %max = call i32 @llvm.smax.i32(i32 %a, i32 %b) %max3 = call i32 @llvm.smax.i32(i32 %max, i32 %c) @@ -389,15 +385,16 @@ ; GCN-LABEL: {{^}}test_max3_i16_ssv: ; SI: v_bfe_i32 v[[C:[0-9]+]], v0, 0, 16 -; SI: s_sext_i32_i16 s[[B_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[B:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[A:[0-9]+]], s0 -; SI: v_mov_b32_e32 v[[B:[0-9]+]], s[[B_TMP]] -; SI: v_max3_i32 v{{[0-9]+}}, s[[A]], v[[B]], v[[C]] +; SI: s_max_i32 s[[MAX:[0-9]+]], s[[A]], s[[B]] +; SI: v_max_i32_e32 v{{[0-9]+}}, s[[MAX]], v[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_max_i16_e32 v[[MAX:[0-9]+]], s0, v[[B]] ; VI: v_max_i16_e32 v{{[0-9]+}}, v[[MAX]], v0 ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_max3_i16 v{{[0-9]+}}, s0, v[[B]], v0 +; GFX9: v_max_i16_e32 v[[MAX:[0-9]+]], s0, v[[B]] +; GFX9: v_max_i16_e32 v{{[0-9]+}}, v[[MAX]], v0 ; GFX10: v_max3_i16 v{{[0-9]+}}, s0, s1, v0 define amdgpu_ps void @test_max3_i16_ssv(i16 inreg %a, i16 inreg %b, i16 %c, i16 addrspace(1)* %out) { %max = call i16 @llvm.smax.i16(i16 %a, i16 %b) @@ -407,15 +404,15 @@ } ; GCN-LABEL: {{^}}test_max3_i16_vss: -; SI: s_sext_i32_i16 s[[C_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[C:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[B:[0-9]+]], s0 ; SI: v_bfe_i32 v[[A:[0-9]+]], v0, 0, 16 -; SI: v_mov_b32_e32 v[[C:[0-9]+]], s[[C_TMP]] -; SI: v_max3_i32 v{{[0-9]+}}, v[[A]], s[[B]], v[[C]] +; SI: v_max_i32_e32 v[[MAX:[0-9]+]], s[[B]], v[[A]] +; SI: v_max_i32_e32 v{{[0-9]+}}, s[[C]], v[[MAX]] ; VI: v_max_i16_e32 v[[MAX:[0-9]+]], s0, v0 ; VI: v_max_i16_e32 v{{[0-9]+}}, s1, v[[MAX]] -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s1 -; GFX9: v_max3_i16 v{{[0-9]+}}, v0, s0, v[[C]] +; GFX9: v_max_i16_e32 v[[MAX:[0-9]+]], s0, v0 +; GFX9: v_max_i16_e32 v{{[0-9]+}}, s1, v[[MAX]] ; GFX10: v_max3_i16 v{{[0-9]+}}, v0, s0, s1 define amdgpu_ps void @test_max3_i16_vss(i16 %a, i16 inreg %b, i16 inreg %c, i16 addrspace(1)* %out) { %max = call i16 @llvm.smax.i16(i16 %a, i16 %b) @@ -425,20 +422,19 @@ } ; GCN-LABEL: {{^}}test_max3_i16_sss: -; SI: s_sext_i32_i16 s[[C_TMP:[0-9]+]], s2 -; SI: s_sext_i32_i16 s[[B_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[C:[0-9]+]], s2 +; SI: s_sext_i32_i16 s[[B:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[A:[0-9]+]], s0 -; SI: v_mov_b32_e32 v[[B:[0-9]+]], s[[B_TMP]] -; SI: v_mov_b32_e32 v[[C:[0-9]+]], s[[C_TMP]] -; SI: v_max3_i32 v{{[0-9]+}}, s[[A]], v[[B]], v[[C]] +; SI: s_max_i32 s[[S_MIN:[0-9]+]], s[[A]], s[[B]] +; SI: s_max_i32 s{{[0-9]+}}, s[[S_MIN]], s[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_max_i16_e32 v[[MAX:[0-9]+]], s0, v[[B]] ; VI: v_max_i16_e32 v{{[0-9]+}}, s2, v[[MAX]] ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX9: v_max3_i16 v{{[0-9]+}}, s0, v[[B]], v[[C]] -; GFX10: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX10: v_max3_i16 v{{[0-9]+}}, s0, s1, v[[C]] +; GFX9: v_max_i16_e32 
v[[MAX:[0-9]+]], s0, v[[B]] +; GFX9: v_max_i16_e32 v{{[0-9]+}}, s2, v[[MAX]] +; GFX10: v_max_i16 v[[MAX:[0-9]+]], s0, s1 +; GFX10: v_max_i16 v{{[0-9]+}}, v[[MAX]], s2 define amdgpu_ps void @test_max3_i16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c, i16 addrspace(1)* %out) { %max = call i16 @llvm.smax.i16(i16 %a, i16 %b) %max3 = call i16 @llvm.smax.i16(i16 %max, i16 %c) Index: llvm/test/CodeGen/AMDGPU/min3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/min3.ll +++ llvm/test/CodeGen/AMDGPU/min3.ll @@ -361,8 +361,8 @@ } ; GCN-LABEL: {{^}}test_min3_i32_ssv: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_min3_i32 v{{[0-9]+}}, s0, v[[B]], v0 +; CST_BUS_LIM_1: s_min_i32 s[[S_MIN:[0-9]+]], s0, s1 +; CST_BUS_LIM_1: v_min_i32_e32 v{{[0-9]+}}, s[[S_MIN]], v0 ; CST_BUS_LIM_2: v_min3_i32 v{{[0-9]+}}, s0, s1, v0 define amdgpu_ps void @test_min3_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c, i32 addrspace(1)* %out) { @@ -373,8 +373,8 @@ } ; GCN-LABEL: {{^}}test_min3_i32_vss: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s1 -; CST_BUS_LIM_1: v_min3_i32 v{{[0-9]+}}, v0, s0, v[[C]] +; CST_BUS_LIM_1: v_min_i32_e32 v[[MIN:[0-9]+]], s0, v0 +; CST_BUS_LIM_1: v_min_i32_e32 v{{[0-9]+}}, s1, v[[MIN]] ; CST_BUS_LIM_2: v_min3_i32 v{{[0-9]+}}, v0, s0, s1 define amdgpu_ps void @test_min3_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { @@ -385,12 +385,8 @@ } ; GCN-LABEL: {{^}}test_min3_i32_sss: -; CST_BUS_LIM_1: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; CST_BUS_LIM_1: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_1: v_min3_i32 v{{[0-9]+}}, s0, v[[B]], v[[C]] - -; CST_BUS_LIM_2: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; CST_BUS_LIM_2: v_min3_i32 v{{[0-9]+}}, s0, s1, v[[C]] +; GCN: s_min_i32 s[[S_MIN:[0-9]+]], s0, s1 +; GCN: s_min_i32 s{{[0-9]+}}, s[[S_MIN]], s2 define amdgpu_ps void @test_min3_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 addrspace(1)* %out) { %min = call i32 @llvm.smin.i32(i32 %a, i32 %b) %min3 = call i32 @llvm.smin.i32(i32 %min, i32 %c) @@ -448,15 +444,16 @@ ; GCN-LABEL: {{^}}test_min3_i16_ssv: ; SI: v_bfe_i32 v[[C:[0-9]+]], v0, 0, 16 -; SI: s_sext_i32_i16 s[[B_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[B:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[A:[0-9]+]], s0 -; SI: v_mov_b32_e32 v[[B:[0-9]+]], s[[B_TMP]] -; SI: v_min3_i32 v{{[0-9]+}}, s[[A]], v[[B]], v[[C]] +; SI: s_min_i32 s[[MIN:[0-9]+]], s[[A]], s[[B]] +; SI: v_min_i32_e32 v{{[0-9]+}}, s[[MIN]], v[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v[[B]] ; VI: v_min_i16_e32 v{{[0-9]+}}, v[[MIN]], v0 ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_min3_i16 v{{[0-9]+}}, s0, v[[B]], v0 +; GFX9: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v[[B]] +; GFX9: v_min_i16_e32 v{{[0-9]+}}, v[[MIN]], v0 ; GFX10: v_min3_i16 v{{[0-9]+}}, s0, s1, v0 define amdgpu_ps void @test_min3_i16_ssv(i16 inreg %a, i16 inreg %b, i16 %c, i16 addrspace(1)* %out) { %min = call i16 @llvm.smin.i16(i16 %a, i16 %b) @@ -466,15 +463,15 @@ } ; GCN-LABEL: {{^}}test_min3_i16_vss: -; SI: s_sext_i32_i16 s[[C_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[C:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[B:[0-9]+]], s0 ; SI: v_bfe_i32 v[[A:[0-9]+]], v0, 0, 16 -; SI: v_mov_b32_e32 v[[C:[0-9]+]], s[[C_TMP]] -; SI: v_min3_i32 v{{[0-9]+}}, v[[A]], s[[B]], v[[C]] +; SI: v_min_i32_e32 v[[MIN:[0-9]+]], s[[B]], v[[A]] +; SI: v_min_i32_e32 v{{[0-9]+}}, s[[C]], v[[MIN]] ; VI: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v0 ; VI: v_min_i16_e32 v{{[0-9]+}}, s1, v[[MIN]] -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s1 -; 
GFX9: v_min3_i16 v{{[0-9]+}}, v0, s0, v[[C]] +; GFX9: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v0 +; GFX9: v_min_i16_e32 v{{[0-9]+}}, s1, v[[MIN]] ; GFX10: v_min3_i16 v{{[0-9]+}}, v0, s0, s1 define amdgpu_ps void @test_min3_i16_vss(i16 %a, i16 inreg %b, i16 inreg %c, i16 addrspace(1)* %out) { %min = call i16 @llvm.smin.i16(i16 %a, i16 %b) @@ -484,20 +481,19 @@ } ; GCN-LABEL: {{^}}test_min3_i16_sss: -; SI: s_sext_i32_i16 s[[C_TMP:[0-9]+]], s2 -; SI: s_sext_i32_i16 s[[B_TMP:[0-9]+]], s1 +; SI: s_sext_i32_i16 s[[C:[0-9]+]], s2 +; SI: s_sext_i32_i16 s[[B:[0-9]+]], s1 ; SI: s_sext_i32_i16 s[[A:[0-9]+]], s0 -; SI: v_mov_b32_e32 v[[B:[0-9]+]], s[[B_TMP]] -; SI: v_mov_b32_e32 v[[C:[0-9]+]], s[[C_TMP]] -; SI: v_min3_i32 v{{[0-9]+}}, s[[A]], v[[B]], v[[C]] +; SI: s_min_i32 s[[S_MIN:[0-9]+]], s[[A]], s[[B]] +; SI: s_min_i32 s{{[0-9]+}}, s[[S_MIN]], s[[C]] ; VI: v_mov_b32_e32 v[[B:[0-9]+]], s1 ; VI: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v[[B]] ; VI: v_min_i16_e32 v{{[0-9]+}}, s2, v[[MIN]] ; GFX9: v_mov_b32_e32 v[[B:[0-9]+]], s1 -; GFX9: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX9: v_min3_i16 v{{[0-9]+}}, s0, v[[B]], v[[C]] -; GFX10: v_mov_b32_e32 v[[C:[0-9]+]], s2 -; GFX10: v_min3_i16 v{{[0-9]+}}, s0, s1, v[[C]] +; GFX9: v_min_i16_e32 v[[MIN:[0-9]+]], s0, v[[B]] +; GFX9: v_min_i16_e32 v{{[0-9]+}}, s2, v[[MIN]] +; GFX10: v_min_i16 v[[MIN:[0-9]+]], s0, s1 +; GFX10: v_min_i16 v{{[0-9]+}}, v[[MIN]], s2 define amdgpu_ps void @test_min3_i16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c, i16 addrspace(1)* %out) { %min = call i16 @llvm.smin.i16(i16 %a, i16 %b) %min3 = call i16 @llvm.smin.i16(i16 %min, i16 %c)