Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -631,9 +631,10 @@
 class IntMed3Pat<Instruction med3Inst,
                  SDPatternOperator max,
                  SDPatternOperator max_oneuse,
-                 SDPatternOperator min_oneuse> : Pat<
-  (max (min_oneuse i32:$src0, i32:$src1),
-       (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+                 SDPatternOperator min_oneuse,
+                 ValueType vt = i32> : Pat<
+  (max (min_oneuse vt:$src0, vt:$src1),
+       (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst $src0, $src1, $src2)
 >;
 
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -271,6 +271,10 @@
     return (getGeneration() >= EVERGREEN);
   }
 
+  bool hasMed3_16() const {
+    return getGeneration() >= GFX9;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -84,6 +84,8 @@
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
+  SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                   SDValue Op0, SDValue Op1, bool Signed) const;
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4037,8 +4037,9 @@
   }
 }
 
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
-                                        SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+    SelectionDAG &DAG, const SDLoc &SL,
+    SDValue Op0, SDValue Op1, bool Signed) const {
   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
   if (!K1)
     return SDValue();
@@ -4056,23 +4057,22 @@
   }
 
   EVT VT = K0->getValueType(0);
 
+  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+    return DAG.getNode(Med3Opc, SL, VT,
+                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+  }
+
+  // If there isn't a 16-bit med3 operation, convert to 32-bit.
   MVT NVT = MVT::i32;
   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
 
-  SDValue Tmp1, Tmp2, Tmp3;
-  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
-  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
-  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
 
-  if (VT == MVT::i16) {
-    Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
-                       Tmp1, Tmp2, Tmp3);
-
-    return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
-  } else
-    return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
-                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
 }
 
 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -4161,7 +4161,9 @@
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
-      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+      (N->getValueType(0) == MVT::f32 ||
+       (N->getValueType(0) == MVT::f16 && Subtarget->hasMed3_16())) &&
+      Op0.hasOneUse()) {
     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
       return Res;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -1318,7 +1318,7 @@
 def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
 def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
 def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
 def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1194,6 +1194,14 @@
 // Miscellaneous Optimization Patterns
 //============================================================================//
 
+// Undo sub x, c -> add x, -c canonicalization since c is more likely
+// an inline immediate than -c.
+// TODO: Also do for 64-bit.
+def : Pat<
+  (add i32:$src0, (i32 NegSubInlineConst32:$src1)),
+  (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
+>;
+
 def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
 
 def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
@@ -1213,14 +1221,11 @@
 def : FPMed3Pat<f32, V_MED3_F32>;
-
-// Undo sub x, c -> add x, -c canonicalization since c is more likely
-// an inline immediate than -c.
-// TODO: Also do for 64-bit.
-def : Pat< - (add i32:$src0, (i32 NegSubInlineConst32:$src1)), - (S_SUB_I32 $src0, NegSubInlineConst32:$src1) ->; +let Predicates = [isGFX9] in { +def : FPMed3Pat; +def : IntMed3Pat; +def : IntMed3Pat; +} // End Predicates = [isGFX9] //============================================================================// // Assembler aliases Index: lib/Target/AMDGPU/VOP3Instructions.td =================================================================== --- lib/Target/AMDGPU/VOP3Instructions.td +++ lib/Target/AMDGPU/VOP3Instructions.td @@ -258,8 +258,8 @@ let Predicates = [isVI] in { -multiclass Tenary_i16_Pats { +multiclass Ternary_i16_Pats { def : Pat< (op2 (op1 i16:$src0, i16:$src1), i16:$src2), (inst i16:$src0, i16:$src1, i16:$src2) @@ -278,8 +278,8 @@ >; } -defm: Tenary_i16_Pats; -defm: Tenary_i16_Pats; +defm: Ternary_i16_Pats; +defm: Ternary_i16_Pats; } // End Predicates = [isVI] @@ -291,6 +291,10 @@ def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile>; def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile>; def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile>; + +def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile, AMDGPUfmed3>; +def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile, AMDGPUsmed3>; +def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile, AMDGPUumed3>; } @@ -487,3 +491,7 @@ defm V_AND_OR_B32 : VOP3_Real_vi <0x201>; defm V_OR3_B32 : VOP3_Real_vi <0x202>; defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>; + +defm V_MED3_F16 : VOP3_Real_vi <0x1fa>; +defm V_MED3_I16 : VOP3_Real_vi <0x1fb>; +defm V_MED3_U16 : VOP3_Real_vi <0x1fc>; Index: test/CodeGen/AMDGPU/fmed3.ll =================================================================== --- test/CodeGen/AMDGPU/fmed3.ll +++ test/CodeGen/AMDGPU/fmed3.ll @@ -1,5 +1,10 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s + ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} @@ -688,8 +693,8 @@ ; --------------------------------------------------------------------- ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0: -; GCN: v_min_f32 -; GCN: v_max_f32 +; GCN-DAG: v_min_f32 +; GCN-DAG: v_max_f32 ; GCN: v_min_f32 ; GCN: v_max_f32 define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { @@ -884,12 +889,86 @@ ret void } +; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16: +; 
SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} +; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 +; SI: v_cvt_f16_f32 + +; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0 +; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0 +; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0 + +; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0 +; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 +define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load half, half addrspace(1)* %gep0 + %a.add = fadd nnan half %a, 1.0 + %max = call half @llvm.maxnum.f16(half %a.add, half 2.0) + %med = call half @llvm.minnum.f16(half %max, half 4.0) + + store half %med, half addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0: +; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]] +; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]] + +; SI: v_cvt_f32_f16 +; SI: v_cvt_f32_f16 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 +; SI: v_add_f32_e32 +; SI: v_med3_f32 +; SI: v_cvt_f16_f32_e32 + + +; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]] +; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] +; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] + +; VI-DAG: v_min_f16 +; VI-DAG: v_max_f16 +; VI: v_min_f16 +; VI: v_max_f16 + +; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] +define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid + %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid + %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid + %a = load volatile half, half addrspace(1)* %gep0 + %b = load volatile half, half addrspace(1)* %gep1 + %c = load volatile half, half addrspace(1)* %gep2 + + %a.nnan = fadd nnan half %a, 1.0 + %b.nnan = fadd nnan half %b, 2.0 + %c.nnan = fadd nnan half %c, 4.0 + + %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan) + %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan) + %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan) + %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2) + store half %med3, half addrspace(1)* %outgep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #0 declare float @llvm.fabs.f32(float) #0 declare float @llvm.minnum.f32(float, float) #0 declare float @llvm.maxnum.f32(float, float) #0 declare double @llvm.minnum.f64(double, double) #0 declare double @llvm.maxnum.f64(double, double) #0 +declare half @llvm.fabs.f16(half) #0 +declare half @llvm.minnum.f16(half, half) #0 +declare half @llvm.maxnum.f16(half, half) #0 attributes #0 = { nounwind readnone } attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" } Index: test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_fmed3_f16: +; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void 
@test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.f16 = trunc i32 %src0.arg to i16 + %src0 = bitcast i16 %src0.f16 to half + %src1.f16 = trunc i32 %src1.arg to i16 + %src1 = bitcast i16 %src1.f16 to half + %src2.f16 = trunc i32 %src2.arg to i16 + %src2 = bitcast i16 %src2.f16 to half + %mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2) + store half %mad, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}test_fmed3_srcmods_f16: +; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| +define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { + %src0.f16 = trunc i32 %src0.arg to i16 + %src0 = bitcast i16 %src0.f16 to half + %src1.f16 = trunc i32 %src1.arg to i16 + %src1 = bitcast i16 %src1.f16 to half + %src2.f16 = trunc i32 %src2.arg to i16 + %src2 = bitcast i16 %src2.f16 to half + %src0.fneg = fsub half -0.0, %src0 + %src1.fabs = call half @llvm.fabs.f16(half %src1) + %src2.fabs = call half @llvm.fabs.f16(half %src2) + %src2.fneg.fabs = fsub half -0.0, %src2.fabs + %mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs) + store half %mad, half addrspace(1)* %out + ret void +} + +declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0 +declare half @llvm.fabs.f16(half) #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/smed3.ll =================================================================== --- test/CodeGen/AMDGPU/smed3.ll +++ test/CodeGen/AMDGPU/smed3.ll @@ -1,12 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32: ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -25,7 +26,7 @@ ; GCN: v_max_i32 ; GCN: v_min_i32 define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -45,7 +46,7 @@ ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 
addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -64,7 +65,7 @@ ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -83,7 +84,7 @@ ; GCN: v_cmp_lt_i64 ; GCN: v_cmp_gt_i64 define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep0 @@ -99,9 +100,10 @@ } ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: -; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid %a = load i16, i16 addrspace(1)* %gep0 @@ -362,6 +364,7 @@ ret void } +; FIXME: Should keep scalar or not promote ; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0: ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 @@ -444,6 +447,35 @@ ret void } +; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0: +; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; FIXME: VI not matching med3 +; VI: v_min_i16 +; VI: v_max_i16 +; VI: v_min_i16 +; VI: v_max_i16 + +; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @smin16(i16 %x, i16 %y) + %tmp1 = call i16 @smax16(i16 %x, i16 %y) + %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z) + %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } Index: test/CodeGen/AMDGPU/umed3.ll =================================================================== --- test/CodeGen/AMDGPU/umed3.ll +++ test/CodeGen/AMDGPU/umed3.ll @@ -1,12 +1,13 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga 
-mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32: ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -25,7 +26,7 @@ ; GCN: v_max_u32 ; GCN: v_min_u32 define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -45,7 +46,7 @@ ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -64,7 +65,7 @@ ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid %a = load i32, i32 addrspace(1)* %gep0 @@ -83,7 +84,7 @@ ; GCN: v_cmp_lt_u64 ; GCN: v_cmp_gt_u64 define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid %a = load i64, i64 addrspace(1)* %gep0 @@ -99,9 +100,10 @@ } ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: -; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 +; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.r600.read.tidig.x() + %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid %a = load i16, i16 addrspace(1)* %gep0 @@ -479,6 +481,35 @@ ret void } +; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0: +; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; FIXME: VI not matching med3 +; VI: v_min_u16 +; VI: v_max_u16 +; VI: v_min_u16 +; VI: v_max_u16 + +; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +bb: + 
%tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid + %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3 + %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8 + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + %x = load i16, i16 addrspace(1)* %gep0 + %y = load i16, i16 addrspace(1)* %gep1 + %z = load i16, i16 addrspace(1)* %gep2 + + %tmp0 = call i16 @umin16(i16 %x, i16 %y) + %tmp1 = call i16 @umax16(i16 %x, i16 %y) + %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z) + %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2) + store i16 %tmp3, i16 addrspace(1)* %out.gep + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } attributes #2 = { nounwind readnone alwaysinline } Index: test/MC/AMDGPU/vop3-gfx9.s =================================================================== --- test/MC/AMDGPU/vop3-gfx9.s +++ test/MC/AMDGPU/vop3-gfx9.s @@ -30,3 +30,15 @@ v_pack_b32_f16 v1, v2, v3 // GFX9: v_pack_b32_f16 v1, v2, v3 ; encoding: [0x01,0x00,0xa0,0xd2,0x02,0x07,0x02,0x00] // NOVI: :1: error: instruction not supported on this GPU + +v_med3_f16 v1, v2, v3, v4 +// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04] +// NOVI: :1: error: instruction not supported on this GPU + +v_med3_i16 v1, v2, v3, v4 +// GFX9: v_med3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfb,0xd1,0x02,0x07,0x12,0x04] +// NOVI: :1: error: instruction not supported on this GPU + +v_med3_u16 v1, v2, v3, v4 +// GFX9: v_med3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfc,0xd1,0x02,0x07,0x12,0x04] +// NOVI: :1: error: instruction not supported on this GPU
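
For reference, the IR shape the new GFX9 handling is meant to catch is the clamp idiom already exercised by the smed3/umed3 tests above. A minimal standalone sketch follows; the kernel name and the 12/17 bounds are illustrative only (they mirror v_test_smed3_r_i_i_i16), and it assumes a gfx9 run line such as llc -march=amdgcn -mcpu=gfx901.

; Illustrative only: clamp an i16 value to [12, 17] as smin(smax(x, 12), 17).
; With hasMed3_16() (gfx9), performIntMed3ImmCombine is expected to select a
; single v_med3_i16 here; on SI/CI/VI it still extends to i32, uses v_med3_i32,
; and truncates, as in the existing tests.
define void @clamp_i16_example(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
  %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
  %a = load i16, i16 addrspace(1)* %gep0
  %icmp0 = icmp sgt i16 %a, 12
  %max = select i1 %icmp0, i16 %a, i16 12      ; smax(%a, 12)
  %icmp1 = icmp slt i16 %max, 17
  %clamp = select i1 %icmp1, i16 %max, i16 17  ; smin(smax(%a, 12), 17)
  store i16 %clamp, i16 addrspace(1)* %outgep
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

The icmp/select pairs are what the DAG canonicalizes to smax/smin, so the same combine covers both this form and direct min/max intrinsics or helper calls used in the tests.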