diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -193,7 +193,8 @@ SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1) const; SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const; + SDValue Src, SDValue MinVal, SDValue MaxVal, + bool Signed) const; SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10588,45 +10588,41 @@ } } -SDValue SITargetLowering::performIntMed3ImmCombine( - SelectionDAG &DAG, const SDLoc &SL, - SDValue Op0, SDValue Op1, bool Signed) const { - ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1); - if (!K1) - return SDValue(); +SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG, + const SDLoc &SL, SDValue Src, + SDValue MinVal, + SDValue MaxVal, + bool Signed) const { + + // med3 comes from + // min(max(x, K0), K1), K0 < K1 + // max(min(x, K0), K1), K1 < K0 + // + // "MinVal" and "MaxVal" respectively refer to the rhs of the + // min/max op. 
+ ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal); + ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal); - ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1)); - if (!K0) + if (!MinK || !MaxK) return SDValue(); if (Signed) { - if (K0->getAPIntValue().sge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) return SDValue(); } else { - if (K0->getAPIntValue().uge(K1->getAPIntValue())) + if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) return SDValue(); } - EVT VT = K0->getValueType(0); + EVT VT = MinK->getValueType(0); unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3; - if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) { - return DAG.getNode(Med3Opc, SL, VT, - Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0)); - } - - // If there isn't a 16-bit med3 operation, convert to 32-bit. - if (VT == MVT::i16) { - MVT NVT = MVT::i32; - unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - - SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - - SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); - } + if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) + return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal); + // Note: we could also extend to i32 and use i32 med3 if i16 med3 is + // not available, but this is unlikely to be profitable as constants + // will often need to be materialized & extended, especially on + // pre-GFX10 where VOP3 instructions couldn't take literal operands. 
return SDValue(); } @@ -10738,13 +10734,26 @@ } // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) + // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) + return Med3; + } + if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) return Med3; } if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) { - if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false)) + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) + return Med3; + } + if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) { + if (SDValue Med3 = performIntMed3ImmCombine( + DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) return Med3; } diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -158,9 +158,11 @@ ; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x ; EG: 8 ; SI: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} +; SI: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16 +; SI: v_med3_i32 v{{[0-9]}}, [[EXT]], ; VI: v_mad_u16 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} -; GCN: v_bfe_i32 [[EXT:v[0-9]]], [[MAD]], 0, 16 -; GCN: v_med3_i32 v{{[0-9]}}, [[EXT]], +; VI: v_max_i16_e32 [[MAX:v[0-9]]], 0xff80, [[MAD]] +; VI: v_min_i16_e32 {{v[0-9]}}, 0x7f, [[MAX]] define amdgpu_kernel void @i8_mad_sat_16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(5) %idx) { entry: 
%retval.0.i = load i64, ptr addrspace(5) %idx diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -12,8 +12,9 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff80 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i8: @@ -53,8 +54,9 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i16: @@ -135,14 +137,14 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -196,16 +198,15 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; 
GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -268,11 +269,11 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 @@ -282,10 +283,8 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll --- a/llvm/test/CodeGen/AMDGPU/smed3.ll 
+++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -94,7 +94,9 @@ declare i64 @llvm.smin.i64(i64, i64) ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: -; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; VI: v_max_i16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} +; VI: v_min_i16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_smed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -12,8 +12,9 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7f, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffffff80, v0 +; GFX6-NEXT: s_movk_i32 s4, 0xff80 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7f +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i8: @@ -53,8 +54,9 @@ ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_min_i32_e32 
v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v1, 0x7fff +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i16: @@ -135,14 +137,14 @@ ; GFX6-NEXT: v_bfe_i32 v3, v3, 0, 16 ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v3 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -196,16 +198,15 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v4 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_max_i32_e32 v3, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v3, v2, s4, v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v3 ; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16 @@ -268,11 +269,11 @@ ; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16 ; 
GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 +; GFX6-NEXT: s_movk_i32 s4, 0x8000 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v0, 0xffff8000, v0 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v0, v0, s4, v5 ; GFX6-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX6-NEXT: v_bfe_i32 v7, v7, 0, 16 @@ -282,10 +283,8 @@ ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v3, v7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; GFX6-NEXT: v_min_i32_e32 v1, 0x7fff, v1 -; GFX6-NEXT: v_min_i32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_max_i32_e32 v1, 0xffff8000, v1 -; GFX6-NEXT: v_max_i32_e32 v2, 0xffff8000, v2 +; GFX6-NEXT: v_med3_i32 v1, v1, s4, v5 +; GFX6-NEXT: v_med3_i32 v2, v2, s4, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #0 @@ -81,7 +81,9 @@ 
} ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: -; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; VI: v_max_u16_e32 [[MAX:v[0-9]]], 12, {{v[0-9]}} +; VI: v_min_u16_e32 {{v[0-9]}}, 17, [[MAX]] ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define amdgpu_kernel void @v_test_umed3_r_i_i_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -19,13 +19,12 @@ ; SDAG-VI-LABEL: basic_smax_smin: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: s_movk_i32 s4, 0xff -; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SDAG-VI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, s4 -; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, s4 -; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SDAG-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: basic_smax_smin: @@ -74,12 +73,11 @@ ; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_sext_i32_i16 s2, s2 -; SDAG-VI-NEXT: s_sext_i32_i16 s3, s3 -; SDAG-VI-NEXT: v_med3_i32 v1, s2, 0, v0 -; SDAG-VI-NEXT: v_med3_i32 v0, s3, 0, v0 -; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; 
SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 +; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-VI-NEXT: flat_store_dword v[0:1], v2 @@ -201,29 +199,25 @@ ; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX9-LABEL: basic_smin_smax: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-GFX9-NEXT: v_min_i16_e32 v1, 0xff, v1 -; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-GFX9-NEXT: v_max_i16_e32 v1, 0, v1 -; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: basic_smin_smax: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0 -; SDAG-GFX11-NEXT: v_min_i16 v1, 0xff, v1 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0 -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smin_smax: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_med3_i16 
v0, v0, 0, 0xff +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smin_smax: ; GISEL-VI: ; %bb.0: @@ -235,26 +229,6 @@ ; GISEL-VI-NEXT: v_max_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX9-LABEL: basic_smin_smax: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 -; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: basic_smin_smax: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.min = call i16 @llvm.smin.i16(i16 %src1, i16 255) @@ -268,36 +242,33 @@ ; SDAG-VI-LABEL: basic_smin_smax_combined: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff ; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2 -; SDAG-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SDAG-VI-NEXT: v_max_i16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: 
v_max_i16_e32 v0, 0, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX9-LABEL: basic_smin_smax_combined: -; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX9-NEXT: v_min_i16_e32 v0, 0xff, v0 -; SDAG-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; SDAG-GFX9-NEXT: v_max_i16_e32 v0, 0, v0 -; SDAG-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; SDAG-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; SDAG-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: basic_smin_smax_combined: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff +; GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 +; GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX11-LABEL: basic_smin_smax_combined: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; SDAG-GFX11-NEXT: v_min_i16 v0, 0xff, v0 -; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_max_i16 v0, v0, 0 -; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: basic_smin_smax_combined: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: @@ -309,26 +280,6 @@ ; GISEL-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-VI-NEXT: 
s_setpc_b64 s[30:31] -; -; GISEL-GFX9-LABEL: basic_smin_smax_combined: -; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX9-NEXT: v_mov_b32_e32 v2, 0xff -; GISEL-GFX9-NEXT: v_med3_i16 v0, v0, 0, v2 -; GISEL-GFX9-NEXT: v_med3_i16 v1, v1, 0, v2 -; GISEL-GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GISEL-GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-GFX11-LABEL: basic_smin_smax_combined: -; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) %src1.max = call i16 @llvm.smax.i16(i16 %src1, i16 0) @@ -342,13 +293,13 @@ ; SDAG-VI-LABEL: vec_smax_smin: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_bfe_i32 v1, v0, 0, 16 +; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-VI-NEXT: v_max_i16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; SDAG-VI-NEXT: v_max_i16_e32 v0, 0, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0xff -; SDAG-VI-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; SDAG-VI-NEXT: v_med3_i32 v0, v0, 0, v2 -; SDAG-VI-NEXT: v_med3_i32 v1, v1, 0, v2 -; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: v_min_i16_e32 v0, 0xff, v0 +; SDAG-VI-NEXT: v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX9-LABEL: vec_smax_smin: @@ -400,12 +351,12 @@ ; 
SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: s_sext_i32_i16 s3, s2 -; SDAG-VI-NEXT: s_ashr_i32 s2, s2, 16 -; SDAG-VI-NEXT: v_med3_i32 v1, s3, 0, v0 -; SDAG-VI-NEXT: v_med3_i32 v0, s2, 0, v0 -; SDAG-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 +; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 +; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 ; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 ; SDAG-VI-NEXT: flat_store_dword v[0:1], v2