Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -445,8 +445,6 @@ case AMDGPUISD::CLAMP: case AMDGPUISD::COS_HW: case AMDGPUISD::SIN_HW: - case AMDGPUISD::FMIN3: - case AMDGPUISD::FMAX3: case AMDGPUISD::FMED3: case AMDGPUISD::FMAD_FTZ: case AMDGPUISD::RCP: Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -385,12 +385,6 @@ FMAX_LEGACY, FMIN_LEGACY, - FMAX3, - SMAX3, - UMAX3, - FMIN3, - SMIN3, - UMIN3, FMED3, SMED3, UMED3, Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4365,12 +4365,6 @@ NODE_NAME_CASE(SIN_HW) NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(FMIN_LEGACY) - NODE_NAME_CASE(FMAX3) - NODE_NAME_CASE(SMAX3) - NODE_NAME_CASE(UMAX3) - NODE_NAME_CASE(FMIN3) - NODE_NAME_CASE(SMIN3) - NODE_NAME_CASE(UMIN3) NODE_NAME_CASE(FMED3) NODE_NAME_CASE(SMED3) NODE_NAME_CASE(UMED3) @@ -4754,8 +4748,6 @@ DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1); } case AMDGPUISD::FMED3: - case AMDGPUISD::FMIN3: - case AMDGPUISD::FMAX3: case AMDGPUISD::FMAD_FTZ: { if (SNaN) return true; Index: llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -150,38 +150,6 @@ [] >; -// FIXME: TableGen doesn't like commutative instructions with more -// than 2 operands. 
-// out = max(a, b, c) a, b and c are floats -def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b, and c are signed ints -def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = max(a, b, c) a, b and c are unsigned ints -def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are floats -def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b, c) a, b and c are signed ints -def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - -// out = min(a, b) a and b are unsigned ints -def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp, - [/*SDNPCommutative, SDNPAssociative*/] ->; - // out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0 def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>; Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -359,10 +359,6 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasMin3Max3_16() const { - return getGeneration() >= AMDGPUSubtarget::GFX9; - } - bool hasFmaMixInsts() const { return HasFmaMixInsts; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9653,9 +9653,7 @@ case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: case AMDGPUISD::CLAMP: - case AMDGPUISD::FMED3: - case AMDGPUISD::FMAX3: - case AMDGPUISD::FMIN3: { + case AMDGPUISD::FMED3: { // FIXME: Shouldn't treat the generic operations different based these. 
// However, we aren't really required to flush the result from // minnum/maxnum.. @@ -9904,27 +9902,6 @@ return isCanonicalized(DAG, N0) ? N0 : SDValue(); } -static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) { - switch (Opc) { - case ISD::FMAXNUM: - case ISD::FMAXNUM_IEEE: - return AMDGPUISD::FMAX3; - case ISD::SMAX: - return AMDGPUISD::SMAX3; - case ISD::UMAX: - return AMDGPUISD::UMAX3; - case ISD::FMINNUM: - case ISD::FMINNUM_IEEE: - return AMDGPUISD::FMIN3; - case ISD::SMIN: - return AMDGPUISD::SMIN3; - case ISD::UMIN: - return AMDGPUISD::UMIN3; - default: - llvm_unreachable("Not a min/max opcode"); - } -} - SDValue SITargetLowering::performIntMed3ImmCombine( SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, SDValue Op1, bool Signed) const { @@ -10044,36 +10021,6 @@ // Only do this if the inner op has one use since this will just increases // register pressure for no benefit. - if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - !VT.isVector() && - (VT == MVT::i32 || VT == MVT::f32 || - ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) { - // max(max(a, b), c) -> max3(a, b, c) - // min(min(a, b), c) -> min3(a, b, c) - if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0.getOperand(0), - Op0.getOperand(1), - Op1); - } - - // Try commuted. 
- // max(a, max(b, c)) -> max3(a, b, c) - // min(a, min(b, c)) -> min3(a, b, c) - if (Op1.getOpcode() == Opc && Op1.hasOneUse()) { - SDLoc DL(N); - return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), - DL, - N->getValueType(0), - Op0, - Op1.getOperand(0), - Op1.getOperand(1)); - } - } - // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) { if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true)) Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2600,6 +2600,48 @@ (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1) >; +// min(min(x, y), z) or max(max(x, y), z) -> min3(x, y, z) or max3(x, y, z) +class Int32Min3OrMax3Pat : AMDGPUPat < + (DivergentBinFrag (min_or_max_oneuse i32:$src0, i32:$src1), + i32:$src2), + (min3_or_max3 VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + +class Int16Min3OrMax3Pat : GCNPat < + (DivergentBinFrag (min_or_max_oneuse i16:$src0, i16:$src1), + i16:$src2), + (min3_or_max3 SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, + SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +class FPMin3OrMax3Pat : GCNPat < + (min_or_max (min_or_max_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), + (VOP3Mods vt:$src1, i32:$src1_mods)), + (vt (VOP3Mods vt:$src2, i32:$src2_mods))), + (min3_or_max3 $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, + DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +def : Int32Min3OrMax3Pat; +def : Int32Min3OrMax3Pat; +def : Int32Min3OrMax3Pat; +def : Int32Min3OrMax3Pat; +def : FPMin3OrMax3Pat; +def : FPMin3OrMax3Pat; + +let SubtargetPredicate = isGFX9Plus in { +def : Int16Min3OrMax3Pat; +def : Int16Min3OrMax3Pat; +def : Int16Min3OrMax3Pat; +def : Int16Min3OrMax3Pat; +def : FPMin3OrMax3Pat; +def : FPMin3OrMax3Pat; +} + multiclass IntMed3Pat, AMDGPUsmin3>; - defm 
V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile, AMDGPUumin3>; - defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile, AMDGPUsmax3>; - defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile, AMDGPUumax3>; + defm V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile>; + defm V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile>; + defm V_MAX3_I32 : VOP3Inst <"v_max3_i32", VOP3_Profile>; + defm V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile>; defm V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile, AMDGPUsmed3>; defm V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile, AMDGPUumed3>; } // End isCommutable = 1 - defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile, AMDGPUfmin3>; - defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile, AMDGPUfmax3>; + defm V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile>; + defm V_MAX3_F32 : VOP3Inst <"v_max3_f32", VOP3_Profile>; defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile, AMDGPUfmed3>; } // End mayRaiseFPException = 0 @@ -624,13 +624,13 @@ defm V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile, AMDGPUsmed3>; defm V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile, AMDGPUumed3>; -defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile, AMDGPUfmin3>; -defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile, AMDGPUsmin3>; -defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile, AMDGPUumin3>; +defm V_MIN3_F16 : VOP3Inst <"v_min3_f16", VOP3_Profile>; +defm V_MIN3_I16 : VOP3Inst <"v_min3_i16", VOP3_Profile>; +defm V_MIN3_U16 : VOP3Inst <"v_min3_u16", VOP3_Profile>; -defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile, AMDGPUfmax3>; -defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile, AMDGPUsmax3>; -defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile, AMDGPUumax3>; +defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile>; +defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile>; +defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile>; defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile>; defm V_SUB_I16 : VOP3Inst 
<"v_sub_i16", VOP3_Profile>; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/min3-and-max3.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/min3-and-max3.ll @@ -0,0 +1,717 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s + +; umin3 + +define i32 @min3_u32(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: min3_u32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min3_u32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_u32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) + %umin3 = call i32 @llvm.umin.i32(i32 %umin, i32 %c) + ret i32 %umin3 +} + +define amdgpu_ps i32 @min3_u32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: min3_u32_uniform: +; VI: ; %bb.0: +; VI-NEXT: s_min_u32 s0, s2, s3 +; VI-NEXT: s_min_u32 s0, s0, s4 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_u32_uniform: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_min_u32 s0, s2, s3 +; GFX10-NEXT: s_min_u32 s0, s0, s4 +; GFX10-NEXT: ; return to shader part epilog + %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) + %umin3 = call i32 @llvm.umin.i32(i32 %umin, i32 %c) + ret i32 %umin3 +} + +define i32 @min3_u32_commute(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: min3_u32_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min3_u32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_u32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_u32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) + %umin3 = call i32 @llvm.umin.i32(i32 %c, i32 %umin) + ret i32 %umin3 +} + +define i16 @min3_u16(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: min3_u16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_u16_e32 v0, v0, v1 +; VI-NEXT: v_min_u16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_u16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_u16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i16 @llvm.umin.i16(i16 %a, i16 %b) + %umin3 = call i16 @llvm.umin.i16(i16 %umin, i16 %c) + ret i16 %umin3 +} + +define i16 @min3_u16_commute(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: min3_u16_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_u16_e32 v0, v0, v1 +; VI-NEXT: v_min_u16_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_u16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_u16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umin = call i16 @llvm.umin.i16(i16 %a, i16 %b) + %umin3 = call i16 @llvm.umin.i16(i16 %c, i16 %umin) + ret i16 %umin3 +} + +; smin3 + +define i32 @min3_i32(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: min3_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min3_i32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_i32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) + %smin3 = call i32 
@llvm.smin.i32(i32 %smin, i32 %c) + ret i32 %smin3 +} + +define amdgpu_ps i32 @min3_i32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: min3_i32_uniform: +; VI: ; %bb.0: +; VI-NEXT: s_min_i32 s0, s2, s3 +; VI-NEXT: s_min_i32 s0, s0, s4 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_i32_uniform: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_min_i32 s0, s2, s3 +; GFX10-NEXT: s_min_i32 s0, s0, s4 +; GFX10-NEXT: ; return to shader part epilog + %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) + %smin3 = call i32 @llvm.smin.i32(i32 %smin, i32 %c) + ret i32 %smin3 +} + +define i32 @min3_i32_commute(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: min3_i32_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min3_i32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_i32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) + %smin3 = call i32 @llvm.smin.i32(i32 %c, i32 %smin) + ret i32 %smin3 +} + +define i16 @min3_i16(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: min3_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_i16_e32 v0, v0, v1 +; VI-NEXT: v_min_i16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_i16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i16 @llvm.smin.i16(i16 %a, i16 %b) + %smin3 = call i16 @llvm.smin.i16(i16 %smin, i16 %c) + ret i16 %smin3 +} + +define i16 @min3_i16_commute(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: min3_i16_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_min_i16_e32 v0, v0, v1 +; VI-NEXT: v_min_i16_e32 v0, v2, v0 
+; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_min3_i16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smin = call i16 @llvm.smin.i16(i16 %a, i16 %b) + %smin3 = call i16 @llvm.smin.i16(i16 %c, i16 %smin) + ret i16 %smin3 +} + + +; umax3 + +define i32 @max3_u32(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: max3_u32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max3_u32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_u32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_u32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) + %umax3 = call i32 @llvm.umax.i32(i32 %umax, i32 %c) + ret i32 %umax3 +} + +define amdgpu_ps i32 @max3_u32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: max3_u32_uniform: +; VI: ; %bb.0: +; VI-NEXT: s_max_u32 s0, s2, s3 +; VI-NEXT: s_max_u32 s0, s0, s4 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: max3_u32_uniform: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_max_u32 s0, s2, s3 +; GFX10-NEXT: s_max_u32 s0, s0, s4 +; GFX10-NEXT: ; return to shader part epilog + %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) + %umax3 = call i32 @llvm.umax.i32(i32 %umax, i32 %c) + ret i32 %umax3 +} + +define i32 @max3_u32_commute(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: max3_u32_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max3_u32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_u32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_u32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i32 @llvm.umax.i32(i32 %a, i32 
%b) + %umax3 = call i32 @llvm.umax.i32(i32 %c, i32 %umax) + ret i32 %umax3 +} + +define i16 @max3_u16(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: max3_u16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: v_max_u16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_u16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_u16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i16 @llvm.umax.i16(i16 %a, i16 %b) + %umax3 = call i16 @llvm.umax.i16(i16 %umax, i16 %c) + ret i16 %umax3 +} + +define i16 @max3_u16_commute(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: max3_u16_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_u16_e32 v0, v0, v1 +; VI-NEXT: v_max_u16_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_u16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_u16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %umax = call i16 @llvm.umax.i16(i16 %a, i16 %b) + %umax3 = call i16 @llvm.umax.i16(i16 %c, i16 %umax) + ret i16 %umax3 +} + +; smax3 + +define i32 @max3_i32(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: max3_i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max3_i32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_i32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) + %smax3 = call i32 @llvm.smax.i32(i32 %smax, i32 %c) + ret i32 %smax3 +} + +define amdgpu_ps i32 @max3_i32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { +; VI-LABEL: max3_i32_uniform: +; VI: ; %bb.0: +; 
VI-NEXT: s_max_i32 s0, s2, s3 +; VI-NEXT: s_max_i32 s0, s0, s4 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: max3_i32_uniform: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_max_i32 s0, s2, s3 +; GFX10-NEXT: s_max_i32 s0, s0, s4 +; GFX10-NEXT: ; return to shader part epilog + %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) + %smax3 = call i32 @llvm.smax.i32(i32 %smax, i32 %c) + ret i32 %smax3 +} + +define i32 @max3_i32_commute(i32 %a, i32 %b, i32 %c) { +; VI-LABEL: max3_i32_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max3_i32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_i32_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_i32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) + %smax3 = call i32 @llvm.smax.i32(i32 %c, i32 %smax) + ret i32 %smax3 +} + +define i16 @max3_i16(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: max3_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: v_max_i16_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max3_i16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i16 @llvm.smax.i16(i16 %a, i16 %b) + %smax3 = call i16 @llvm.smax.i16(i16 %smax, i16 %c) + ret i16 %smax3 +} + +define i16 @max3_i16_commute(i16 %a, i16 %b, i16 %c) { +; VI-LABEL: max3_i16_commute: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_i16_e32 v0, v0, v1 +; VI-NEXT: v_max_i16_e32 v0, v2, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_i16_commute: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-NEXT: v_max3_i16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %smax = call i16 @llvm.smax.i16(i16 %a, i16 %b) + %smax3 = call i16 @llvm.smax.i16(i16 %c, i16 %smax) + ret i16 %smax3 +} + +; fmin ieee=true + +define float @min3_f32_ieee_true(float %a, float %b, float %c) { +; VI-LABEL: min3_f32_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_f32_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %min = call float @llvm.minnum.f32(float %a, float %b) + %min3 = call float @llvm.minnum.f32(float %min, float %c) + ret float %min3 +} + +define float @min3_f32_commute_ieee_true(float %a, float %b, float %c) { +; VI-LABEL: min3_f32_commute_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_f32_commute_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %min = call float @llvm.minnum.f32(float %a, float %b) + %min3 = call float @llvm.minnum.f32(float %c, float %min) + ret float %min3 +} + +define half @min3_f16_ieee_true(half %a, half %b, half %c) { +; 
VI-LABEL: min3_f16_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_f16_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %min = call half @llvm.minnum.f16(half %a, half %b) + %min3 = call half @llvm.minnum.f16(half %min, half %c) + ret half %min3 +} + +define half @min3_f16_commute_ieee_true(half %a, half %b, half %c) { +; VI-LABEL: min3_f16_commute_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_min_f16_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: min3_f16_commute_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %min = call half @llvm.minnum.f16(half %a, half %b) + %min3 = call half @llvm.minnum.f16(half %c, half %min) + ret half %min3 +} + +; fmin ieee=false + +define amdgpu_ps float @min3_f32_ieee_false(float %a, float %b, float %c) { +; VI-LABEL: min3_f32_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_f32_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: 
v_min3_f32 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %min = call float @llvm.minnum.f32(float %a, float %b) + %min3 = call float @llvm.minnum.f32(float %min, float %c) + ret float %min3 +} + +define amdgpu_ps float @min3_f32_commute_ieee_false(float %a, float %b, float %c) { +; VI-LABEL: min3_f32_commute_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_min3_f32 v0, v0, v1, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_f32_commute_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %min = call float @llvm.minnum.f32(float %a, float %b) + %min3 = call float @llvm.minnum.f32(float %c, float %min) + ret float %min3 +} + +define amdgpu_ps half @min3_f16_ieee_false(half %a, half %b, half %c) { +; VI-LABEL: min3_f16_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_f16_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %min = call half @llvm.minnum.f16(half %a, half %b) + %min3 = call half @llvm.minnum.f16(half %min, half %c) + ret half %min3 +} + +define amdgpu_ps half @min3_f16_commute_ieee_false(half %a, half %b, half %c) { +; VI-LABEL: min3_f16_commute_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_min_f16_e32 v0, v0, v1 +; VI-NEXT: v_min_f16_e32 v0, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: min3_f16_commute_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min3_f16 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %min = call half @llvm.minnum.f16(half %a, half %b) + %min3 = call half @llvm.minnum.f16(half %c, half %min) + ret half %min3 +} + +; fmax ieee=true + +define float @max3_f32_ieee_true(float %a, float %b, float %c) { +; VI-LABEL: max3_f32_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: 
v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_f32_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %max = call float @llvm.maxnum.f32(float %a, float %b) + %max3 = call float @llvm.maxnum.f32(float %max, float %c) + ret float %max3 +} + +define float @max3_f32_commute_ieee_true(float %a, float %b, float %c) { +; VI-LABEL: max3_f32_commute_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_f32_commute_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %max = call float @llvm.maxnum.f32(float %a, float %b) + %max3 = call float @llvm.maxnum.f32(float %c, float %max) + ret float %max3 +} + +define half @max3_f16_ieee_true(half %a, half %b, half %c) { +; VI-LABEL: max3_f16_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_f16_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %max = call half @llvm.maxnum.f16(half %a, half %b) + %max3 = call half @llvm.maxnum.f16(half %max, half %c) + ret half %max3 +} + +define half @max3_f16_commute_ieee_true(half %a, half %b, half %c) { +; VI-LABEL: max3_f16_commute_ieee_true: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v1, v2, v2 +; VI-NEXT: v_max_f16_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: max3_f16_commute_ieee_true: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v2, v2, v2 +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %max = call half @llvm.maxnum.f16(half %a, half %b) + %max3 = call half @llvm.maxnum.f16(half %c, half %max) + ret half %max3 +} + +; fmax ieee=false + +define amdgpu_ps float @max3_f32_ieee_false(float %a, float %b, float %c) { +; VI-LABEL: max3_f32_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: max3_f32_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %max = call float @llvm.maxnum.f32(float %a, float %b) + %max3 = call float @llvm.maxnum.f32(float %max, float %c) + ret float %max3 +} + +define amdgpu_ps float @max3_f32_commute_ieee_false(float %a, float %b, float %c) { +; VI-LABEL: max3_f32_commute_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_max3_f32 v0, v0, v1, v2 +; VI-NEXT: ; 
return to shader part epilog +; +; GFX10-LABEL: max3_f32_commute_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f32 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %max = call float @llvm.maxnum.f32(float %a, float %b) + %max3 = call float @llvm.maxnum.f32(float %c, float %max) + ret float %max3 +} + +define amdgpu_ps half @max3_f16_ieee_false(half %a, half %b, half %c) { +; VI-LABEL: max3_f16_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: max3_f16_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %max = call half @llvm.maxnum.f16(half %a, half %b) + %max3 = call half @llvm.maxnum.f16(half %max, half %c) + ret half %max3 +} + +define amdgpu_ps half @max3_f16_commute_ieee_false(half %a, half %b, half %c) { +; VI-LABEL: max3_f16_commute_ieee_false: +; VI: ; %bb.0: +; VI-NEXT: v_max_f16_e32 v0, v0, v1 +; VI-NEXT: v_max_f16_e32 v0, v2, v0 +; VI-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: max3_f16_commute_ieee_false: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_max3_f16 v0, v0, v1, v2 +; GFX10-NEXT: ; return to shader part epilog + %max = call half @llvm.maxnum.f16(half %a, half %b) + %max3 = call half @llvm.maxnum.f16(half %c, half %max) + ret half %max3 +} + +declare i16 @llvm.umin.i16(i16, i16) +declare i32 @llvm.umin.i32(i32, i32) +declare i16 @llvm.smin.i16(i16, i16) +declare i32 @llvm.smin.i32(i32, i32) +declare i16 @llvm.umax.i16(i16, i16) +declare i32 @llvm.umax.i32(i32, i32) +declare i16 @llvm.smax.i16(i16, i16) +declare i32 @llvm.smax.i32(i32, i32) +declare half @llvm.minnum.f16(half, half) +declare half @llvm.maxnum.f16(half, half) +declare float @llvm.minnum.f32(float, float) +declare float @llvm.maxnum.f32(float, float) Index: llvm/test/CodeGen/AMDGPU/ctlz.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/ctlz.ll +++ llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -523,10 +523,11 @@ ; SI-NEXT: s_flbit_i32_b32 s4, s4 ; SI-NEXT: s_flbit_i32_b32 s5, s5 ; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: v_mov_b32_e32 v0, s5 ; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -541,7 +542,8 @@ ; VI-NEXT: s_flbit_i32_b32 s0, s0 ; VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp ; VI-NEXT: s_flbit_i32_b32 s0, s1 -; VI-NEXT: v_min3_u32 v0, v0, s0, 64 +; VI-NEXT: v_min_u32_e32 v0, s0, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -573,7 +575,8 @@ ; GFX10-NEXT: s_flbit_i32_b32 s0, s2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -602,14 +605,15 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 -; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: s_flbit_i32_b32 s5, s5 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_flbit_i32_b32 s2, s4 +; SI-NEXT: s_flbit_i32_b32 s4, s5 +; SI-NEXT: s_min_u32 s2, s2, 0xffffffdf +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: s_min_u32 s2, s2, s4 +; SI-NEXT: s_min_u32 s4, s2, 64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -623,7 +627,8 @@ ; VI-NEXT: s_flbit_i32_b32 s0, s0 ; 
VI-NEXT: v_add_u32_e64 v0, s[2:3], s0, 32 clamp ; VI-NEXT: s_flbit_i32_b32 s0, s1 -; VI-NEXT: v_min3_u32 v0, v0, s0, 64 +; VI-NEXT: v_min_u32_e32 v0, s0, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -654,7 +659,8 @@ ; GFX10-NEXT: s_flbit_i32_b32 s0, s2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -774,9 +780,8 @@ ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 +; GFX10-GISEL-NEXT: v_min3_u32 v0, v1, v0, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -887,8 +892,7 @@ ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v1, v2, v1, 64 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/cttz.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/cttz.ll +++ llvm/test/CodeGen/AMDGPU/cttz.ll @@ -514,12 +514,13 @@ ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_ff1_i32_b32 s5, s5 +; SI-NEXT: s_ff1_i32_b32 s4, s4 ; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf ; SI-NEXT: s_add_i32 s5, s5, 32 -; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 
64 +; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -534,7 +535,8 @@ ; VI-NEXT: s_ff1_i32_b32 s1, s1 ; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp ; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_min3_u32 v0, s0, v0, 64 +; VI-NEXT: v_min_u32_e32 v0, s0, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -566,7 +568,8 @@ ; GFX10-NEXT: s_ff1_i32_b32 s0, s3 ; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp ; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -595,14 +598,15 @@ ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s5, s5 -; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf -; SI-NEXT: s_add_i32 s5, s5, 32 +; SI-NEXT: s_ff1_i32_b32 s2, s5 ; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_min_u32 s2, s2, 0xffffffdf +; SI-NEXT: s_add_i32 s2, s2, 32 +; SI-NEXT: s_min_u32 s2, s4, s2 +; SI-NEXT: s_min_u32 s4, s2, 64 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -616,7 +620,8 @@ ; VI-NEXT: s_ff1_i32_b32 s1, s1 ; VI-NEXT: v_add_u32_e64 v0, s[2:3], s1, 32 clamp ; VI-NEXT: s_ff1_i32_b32 s0, s0 -; VI-NEXT: v_min3_u32 v0, s0, v0, 64 +; VI-NEXT: v_min_u32_e32 v0, s0, v0 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -647,7 +652,8 @@ ; GFX10-NEXT: s_ff1_i32_b32 s0, s3 ; GFX10-NEXT: v_add_nc_u32_e64 
v0, s0, 32 clamp ; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 +; GFX10-NEXT: v_min_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -767,9 +773,8 @@ ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 ; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -880,8 +885,7 @@ ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 -; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 +; GFX10-GISEL-NEXT: v_min3_u32 v1, v1, v2, 64 ; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/fmax3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmax3.ll +++ llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -1,12 +1,18 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SI_VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SI_VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}test_fmax3_olt_0_f32: ; GCN: buffer_load_dword [[REGC:v[0-9]+]] ; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; GFX9: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[QUIET_B]], [[QUIET_A]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { @@ -24,7 +30,13 @@ ; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] ; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_max3_f32 [[RESULT:v[0-9]+]], [[QUIET_B]], [[QUIET_A]], [[QUIET_C]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { @@ -45,7 +57,10 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 
[[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] @@ -54,7 +69,10 @@ ; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[MAX0]], [[QUIET_C]] -; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] +; GFX9: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -75,7 +93,10 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI-DAG: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] @@ -84,7 +105,10 @@ ; VI: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[MAX0]] -; GFX9: v_max3_f16 
[[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 Index: llvm/test/CodeGen/AMDGPU/fmin3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmin3.ll +++ llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -1,12 +1,18 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SI_VI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SI_VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}test_fmin3_olt_0_f32: ; GCN: buffer_load_dword [[REGC:v[0-9]+]] ; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; GFX9: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f32_e32 
[[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[QUIET_C]], [[QUIET_B]], [[QUIET_A]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { %a = load volatile float, float addrspace(1)* %aptr, align 4 @@ -23,7 +29,13 @@ ; GCN: buffer_load_dword [[REGB:v[0-9]+]] ; GCN: buffer_load_dword [[REGA:v[0-9]+]] ; GCN: buffer_load_dword [[REGC:v[0-9]+]] -; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[REGB]] +; SI_VI: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[REGA]] +; SI_VI: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[REGC]] +; GFX9: v_max_f32_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f32_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f32_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GCN: v_min3_f32 [[RESULT:v[0-9]+]], [[QUIET_B]], [[QUIET_A]], [[QUIET_C]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #0 { %a = load volatile float, float addrspace(1)* %aptr, align 4 @@ -36,17 +48,26 @@ } ; GCN-LABEL: {{^}}test_fmin3_olt_0_f16: -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGA:v[0-9]+]] +; GCN: buffer_load_ushort [[REGB:v[0-9]+]] +; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI: v_min3_f32 
[[RESULT_F32:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] ; VI: v_min_f16_e32 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GFX9: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 @@ -67,13 +88,19 @@ ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] ; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_A:v[0-9]+]], 1.0, [[CVT_A]] +; SI-DAG: v_mul_f32_e32 [[QUIET_B:v[0-9]+]], 1.0, [[CVT_B]] +; SI-DAG: v_mul_f32_e32 [[QUIET_C:v[0-9]+]], 1.0, [[CVT_C]] +; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI: v_min_f16_e32 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_A:v[0-9]+]], [[REGA]], [[REGA]] +; GFX9: v_max_f16_e32 [[QUIET_B:v[0-9]+]], [[REGB]], [[REGB]] +; GFX9: v_max_f16_e32 [[QUIET_C:v[0-9]+]], [[REGC]], [[REGC]] +; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[QUIET_A]], [[QUIET_B]], [[QUIET_C]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 
Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -540,6 +540,9 @@ ; GCN-LABEL: v_test_known_not_snan_fmin3_input_fmed3_r_i_i_f32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_min3_f32 v0, v0, v1, v2 ; GCN-NEXT: v_med3_f32 v0, v0, 2.0, 4.0 ; GCN-NEXT: s_setpc_b64 s[30:31] Index: llvm/test/CodeGen/AMDGPU/max3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max3.ll +++ llvm/test/CodeGen/AMDGPU/max3.ll @@ -260,6 +260,26 @@ ret void } +; GCN-LABEL: {{^}}max3_u32_uniform: +; GCN: s_max_u32 +; GCN: s_max_u32 +define amdgpu_ps i32 @max3_u32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { + %umax = call i32 @llvm.umax.i32(i32 %a, i32 %b) + %umax3 = call i32 @llvm.umax.i32(i32 %umax, i32 %c) + ret i32 %umax3 +} + +; GCN-LABEL: {{^}}max3_i32_uniform: +; GCN: s_max_i32 +; GCN: s_max_i32 +define amdgpu_ps i32 @max3_i32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { + %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b) + %smax3 = call i32 @llvm.smax.i32(i32 %smax, i32 %c) + ret i32 %smax3 +} + +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } Index: llvm/test/CodeGen/AMDGPU/min3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/min3.ll +++ llvm/test/CodeGen/AMDGPU/min3.ll @@ -330,6 +330,27 @@ ret void } +; GCN-LABEL: {{^}}min3_u32_uniform: +; GCN: s_min_u32 +; GCN: s_min_u32 +define amdgpu_ps i32 @min3_u32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { + %umin = call i32 @llvm.umin.i32(i32 %a, i32 %b) + %umin3 = call i32 @llvm.umin.i32(i32 %umin, i32 
%c) + ret i32 %umin3 +} + +; GCN-LABEL: {{^}}min3_i32_uniform: +; GCN: s_min_i32 +; GCN: s_min_i32 +define amdgpu_ps i32 @min3_i32_uniform(i32 inreg %a, i32 inreg %b, i32 inreg %c) { + %smin = call i32 @llvm.smin.i32(i32 %a, i32 %b) + %smin3 = call i32 @llvm.smin.i32(i32 %smin, i32 %c) + ret i32 %smin3 +} + +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind }