diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -191,7 +191,8 @@ SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1) const; SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const; + SDValue Src, SDValue MinVal, SDValue MaxVal, + bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10527,30 +10527,36 @@ } } -SDValue SITargetLowering::performIntMed3ImmCombine( - SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const { - ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); - if (!K1) - return SDValue(); +SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, SDValue Src, + SDValue MinVal, + SDValue MaxVal, + bool Signed) const { + + // med3 comes from + // min(max(x, K0), K1), K0 < K1 + // max(min(x, K0), K1), K1 < K0 + // + // "MinVal" and "MaxVal" respectively refer to the rhs of the + // min/max op. 
+ ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); + ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); - ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); - if (!K0) + if (!MinK || !MaxK) return SDValue(); if (Signed) { - if (K0->getAPIntValue().sge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) return SDValue(); } else { - if (K0->getAPIntValue().uge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) return SDValue(); } - EVT VT = K0->getValueType(0); + EVT VT = MinK->getValueType(0); unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { - return DAG.getNode(Med3Opc, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); + return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); } // If there isn't a 16-bit med3 operation, convert to 32-bit. @@ -10558,9 +10564,9 @@ MVT NVT = MVT::i32; unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Src); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, MaxVal); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, MinVal); SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); @@ -10677,13 +10683,26 @@ } // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) + return Med3; + } else if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && + Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, 
SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) return Med3; } if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) + return Med3; + } else if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && + Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) return Med3; } diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -12,16 +12,19 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff80 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_add_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0 -; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX8-NEXT: s_movk_i32 s4, 0xff80 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX8-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i8: @@ -53,8 +56,9 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i16: @@ -135,14 +139,14 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -196,16 +200,15 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -268,11 +271,11 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 
v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 @@ -282,10 +285,8 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -12,16 +12,19 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff80 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_sub_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_min_i16_e32 v0, 0x7f, v0 -; GFX8-NEXT: v_max_i16_e32 v0, 0xff80, v0 +; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX8-NEXT: s_movk_i32 s4, 0xff80 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX8-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i8: @@ -53,8 +56,9 @@ ; GFX6-NEXT: 
v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i16: @@ -135,14 +139,14 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -196,16 +200,15 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: 
v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -268,11 +271,11 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 @@ -282,10 +285,8 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -193,37 +193,34 @@ ; SDAG-VI-LABEL: basic_smin_smax: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_movk_i32 s4, 0xff +; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16 +; SDAG-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; 
SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, s4 +; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, s4 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX9-LABEL: basic_smin_smax: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-GFX9-NEXT: v_min_i16_e32 v1, 0xff, v1 -; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-GFX9-NEXT: v_max_i16_e32 v1, 0, v1 -; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: basic_smin_smax: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0 -; SDAG-GFX11-NEXT: v_min_i16 v1, 0xff, v1 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smin_smax: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: 
basic_smin_smax: ; GISEL-VI: ; %bb.0: @@ -235,26 +232,6 @@ ; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX9-LABEL: basic_smin_smax: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 -; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: basic_smin_smax: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255) @@ -268,36 +245,34 @@ ; SDAG-VI-LABEL: basic_smin_smax_combined: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: s_movk_i32 s4, 0xff ; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff -; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2 +; SDAG-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 +; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, s4 +; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, s4 ; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX9-LABEL: 
basic_smin_smax_combined: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: basic_smin_smax_combined: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax_combined: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0 -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smin_smax_combined: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: @@ -309,26 +284,6 @@ ; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX9-LABEL: basic_smin_smax_combined: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 -; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: basic_smin_smax_combined: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) @@ -520,13 +475,13 @@ ; SDAG-VI-LABEL: vec_smin_smax: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0xff -; SDAG-VI-NEXT: v_min_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-VI-NEXT: v_bfe_i32 v1, v0, 0, 16 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 +; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, v2 +; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2 +; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX9-LABEL: vec_smin_smax: