Index: lib/Target/R600/AMDGPUISelLowering.h =================================================================== --- lib/Target/R600/AMDGPUISelLowering.h +++ lib/Target/R600/AMDGPUISelLowering.h @@ -140,7 +140,14 @@ SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; - SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const; + SDValue CombineMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, + SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; virtual SDNode *PostISelFolding(MachineSDNode *N, @@ -188,10 +195,10 @@ // Denormals handled on some parts. COS_HW, SIN_HW, - FMAX, + FMAX_LEGACY, SMAX, UMAX, - FMIN, + FMIN_LEGACY, SMIN, UMIN, URECIP, Index: lib/Target/R600/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/R600/AMDGPUISelLowering.cpp +++ lib/Target/R600/AMDGPUISelLowering.cpp @@ -378,6 +378,7 @@ setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom); setTargetDAGCombine(ISD::MUL); + setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::STORE); @@ -980,21 +981,21 @@ } /// \brief Generate Min/Max node -SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N, +SDValue AMDGPUTargetLowering::CombineMinMax(SDLoc DL, + EVT VT, + SDValue LHS, + SDValue RHS, + SDValue True, + SDValue False, + SDValue CC, SelectionDAG &DAG) const { - SDLoc DL(N); - EVT VT = N->getValueType(0); - - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - SDValue True = N->getOperand(2); - SDValue False = N->getOperand(3); - SDValue CC = N->getOperand(4); + if (VT != MVT::f32 && + (VT != MVT::f64 || + Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)) + return SDValue(); - if (VT != MVT::f32 || - !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { + if (!(LHS == True && RHS == False) && 
!(LHS == False && RHS == True)) return SDValue(); - } ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); switch (CCOpcode) { @@ -1010,14 +1011,15 @@ case ISD::SETTRUE2: case ISD::SETUO: case ISD::SETO: - llvm_unreachable("Operation should already be optimised!"); + break; case ISD::SETULE: case ISD::SETULT: case ISD::SETOLE: case ISD::SETOLT: case ISD::SETLE: case ISD::SETLT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX; + unsigned Opc + = (LHS == True) ? AMDGPUISD::FMIN_LEGACY : AMDGPUISD::FMAX_LEGACY; return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETGT: @@ -1026,7 +1028,8 @@ case ISD::SETOGE: case ISD::SETUGT: case ISD::SETOGT: { - unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN; + unsigned Opc + = (LHS == True) ? AMDGPUISD::FMAX_LEGACY : AMDGPUISD::FMIN_LEGACY; return DAG.getNode(Opc, DL, VT, LHS, RHS); } case ISD::SETCC_INVALID: @@ -2091,9 +2094,37 @@ simplifyI24(N1, DCI); return SDValue(); } - case ISD::SELECT_CC: { - return CombineMinMax(N, DAG); + case ISD::SELECT_CC: { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue True = N->getOperand(2); + SDValue False = N->getOperand(3); + SDValue CC = N->getOperand(4); + + return CombineMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); + } + case ISD::SELECT: { + SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() == ISD::SETCC) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + SDValue CC = Cond.getOperand(2); + + SDValue True = N->getOperand(1); + SDValue False = N->getOperand(2); + + + return CombineMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); } + + break; + } case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { assert(!N->getValueType(0).isVector() && @@ -2270,10 +2301,10 @@ NODE_NAME_CASE(FRACT) NODE_NAME_CASE(CLAMP) NODE_NAME_CASE(MAD) - NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(FMAX_LEGACY) NODE_NAME_CASE(SMAX) 
NODE_NAME_CASE(UMAX) - NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(FMIN_LEGACY) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) NODE_NAME_CASE(URECIP) Index: lib/Target/R600/AMDGPUInstrInfo.td =================================================================== --- lib/Target/R600/AMDGPUInstrInfo.td +++ lib/Target/R600/AMDGPUInstrInfo.td @@ -58,9 +58,12 @@ def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>; -// out = max(a, b) a and b are floats -def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +// out = max(a, b) a and b are floats, where a nan comparison fails. +// This is not commutative because this gives the second operand: +// x < nan ? x : nan -> nan +// nan < x ? nan : x -> x +def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp, + [SDNPAssociative] >; def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>; @@ -76,9 +79,9 @@ [SDNPCommutative, SDNPAssociative] >; -// out = min(a, b) a and b are floats -def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative] +// out = min(a, b) a and b are floats, where a nan comparison fails. 
+def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp, + [SDNPAssociative] >; // out = min(a, b) a snd b are signed ints @@ -137,7 +140,7 @@ // MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src) // // src0: vec4(src, 0, 0, mask) -// src1: dst - rat offset (aka pointer) in dwords +// src1: dst - rat offset (aka pointer) in dwords def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR", SDTypeProfile<0, 2, []>, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; Index: lib/Target/R600/R600Instructions.td =================================================================== --- lib/Target/R600/R600Instructions.td +++ lib/Target/R600/R600Instructions.td @@ -674,8 +674,9 @@ // Non-IEEE MUL: 0 * anything = 0 def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; -def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; -def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; +// TODO: Do these actually match the regular fmin/fmax behavior? +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax_legacy>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin_legacy>; // For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, // so some of the instruction names don't match the asm string. 
Index: lib/Target/R600/SIInstructions.td =================================================================== --- lib/Target/R600/SIInstructions.td +++ lib/Target/R600/SIInstructions.td @@ -1400,11 +1400,11 @@ defm V_MIN_LEGACY_F32 : VOP2Inst , "V_MIN_LEGACY_F32", - VOP_F32_F32_F32, AMDGPUfmin + VOP_F32_F32_F32, AMDGPUfmin_legacy >; defm V_MAX_LEGACY_F32 : VOP2Inst , "V_MAX_LEGACY_F32", - VOP_F32_F32_F32, AMDGPUfmax + VOP_F32_F32_F32, AMDGPUfmax_legacy >; defm V_MIN_F32 : VOP2Inst , "V_MIN_F32", VOP_F32_F32_F32, fminnum>; Index: test/CodeGen/R600/fcmp64.ll =================================================================== --- test/CodeGen/R600/fcmp64.ll +++ test/CodeGen/R600/fcmp64.ll @@ -1,58 +1,58 @@ ; RUN: llc < %s -march=r600 -mcpu=tahiti -verify-machineinstrs | FileCheck %s -; CHECK: {{^}}flt_f64: +; CHECK-LABEL: {{^}}flt_f64 ; CHECK: V_CMP_LT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double addrspace(1)* %in1 %r1 = load double addrspace(1)* %in2 %r2 = fcmp ult double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out ret void } -; CHECK: {{^}}fle_f64: +; CHECK-LABEL: {{^}}fle_f64 ; CHECK: V_CMP_LE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double addrspace(1)* %in1 %r1 = load double addrspace(1)* %in2 %r2 = fcmp ule double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out ret void } -; CHECK: {{^}}fgt_f64: +; 
CHECK-LABEL: {{^}}fgt_f64 ; CHECK: V_CMP_GT_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double addrspace(1)* %in1 %r1 = load double addrspace(1)* %in2 %r2 = fcmp ugt double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out ret void } -; CHECK: {{^}}fge_f64: +; CHECK-LABEL: {{^}}fge_f64 ; CHECK: V_CMP_GE_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double addrspace(1)* %in1 %r1 = load double addrspace(1)* %in2 %r2 = fcmp uge double %r0, %r1 - %r3 = select i1 %r2, double %r0, double %r1 - store double %r3, double addrspace(1)* %out + %r3 = zext i1 %r2 to i32 + store i32 %r3, i32 addrspace(1)* %out ret void } -; CHECK: {{^}}fne_f64: +; CHECK-LABEL: {{^}}fne_f64 ; CHECK: V_CMP_NEQ_F64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, @@ -65,7 +65,7 @@ ret void } -; CHECK: {{^}}feq_f64: +; CHECK-LABEL: {{^}}feq_f64 ; CHECK: V_CMP_EQ_F64_e64 {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, Index: test/CodeGen/R600/fmax_legacy.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/fmax_legacy.ll @@ -0,0 +1,42 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: @test_fmax_legacy_uge_f32 +; SI: 
V_MAX_LEGACY_F32_e32 +; EG: MAX +define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp uge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_oge_f32 +; SI: V_MAX_LEGACY_F32_e32 +; EG: MAX +define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp oge float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ugt_f32 +; SI: V_MAX_LEGACY_F32_e32 +; EG: MAX +define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp ugt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmax_legacy_ogt_f32 +; SI: V_MAX_LEGACY_F32_e32 +; EG: MAX +define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp ogt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/R600/fmin_legacy.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/fmin_legacy.ll @@ -0,0 +1,51 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s + +; FUNC-LABEL: @test_fmin_legacy_f32 +; EG: MIN * +; SI: V_MIN_LEGACY_F32_e32 +define void @test_fmin_legacy_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) nounwind { + %r0 = extractelement <4 x float> %reg0, i32 0 + %r1 = extractelement <4 x float> %reg0, i32 1 + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + %vec = insertelement <4 x float> undef, float 
%r3, i32 0 + store <4 x float> %vec, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ule_f32 +; SI: V_MIN_LEGACY_F32_e32 +define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp ule float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ole_f32 +; SI: V_MIN_LEGACY_F32_e32 +define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp ole float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_olt_f32 +; SI: V_MIN_LEGACY_F32_e32 +define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp olt float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @test_fmin_legacy_ult_f32 +; SI: V_MIN_LEGACY_F32_e32 +define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float %a, float %b) nounwind { + %cmp = fcmp ult float %a, %b + %val = select i1 %cmp, float %a, float %b + store float %val, float addrspace(1)* %out, align 4 + ret void +}