Index: lib/Target/NVPTX/NVPTXISelLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -279,6 +279,7 @@
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SELECT);
 
   // Now deduce the information based on the above mentioned
   // actions
@@ -4059,6 +4060,86 @@
   return SDValue();
 }
 
+/// Target DAG combine for ISD::SELECT nodes.
+///
+/// Matches select(setcc(LHS, RHS, CC), T, F) where {T, F} is {LHS, RHS} in
+/// either order, the result type is i32 or i64, and CC is one of the
+/// SET[U]{LT,LE,GT,GE} codes, and rewrites it to the matching nvvm min/max
+/// intrinsic node. Returns an empty SDValue when the pattern does not match.
+static SDValue PerformSELECTCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  // Currently this detects patterns for integer min and max and
+  // lowers them to PTX-specific intrinsics that enable hardware
+  // support.
+
+  SDValue Cond = N->getOperand(0);
+  if (Cond.getOpcode() != ISD::SETCC) return SDValue();
+
+  SDValue LHS = Cond.getOperand(0);
+  SDValue RHS = Cond.getOperand(1);
+  SDValue True = N->getOperand(1);
+  SDValue False = N->getOperand(2);
+  // The selected values must be exactly the two compared values, in either
+  // order; anything else is not a min/max of the compared operands.
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64) return SDValue();
+
+  bool IsSigned;
+  bool IsMax;
+  // For the less-than codes RHS is the larger value when the condition holds,
+  // so selecting RHS on true is a max; the greater-than codes mirror this.
+  switch (cast<CondCodeSDNode>(Cond.getOperand(2))->get()) {
+    case ISD::SETULE:
+    case ISD::SETULT:
+      IsSigned = false;
+      IsMax = (RHS == True);
+      break;
+
+    case ISD::SETLE:
+    case ISD::SETLT:
+      IsSigned = true;
+      IsMax = (RHS == True);
+      break;
+
+    case ISD::SETGT:
+    case ISD::SETGE:
+      IsSigned = true;
+      IsMax = (LHS == True);
+      break;
+
+    case ISD::SETUGE:
+    case ISD::SETUGT:
+      IsSigned = false;
+      IsMax = (LHS == True);
+      break;
+
+    default:
+      return SDValue();
+  }
+
+  unsigned IntrinsicId;
+  if (VT == MVT::i32) {
+    if (IsSigned)
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
+    else
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
+  } else {
+    assert(VT == MVT::i64);
+    if (IsSigned)
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
+    else
+      IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
+  }
+
+  // The intrinsic ID is passed as the first operand of INTRINSIC_WO_CHAIN.
+  SDLoc DL(N);
+  return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                         DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
+}
+
 enum OperandSignedness {
   Signed = 0,
   Unsigned,
@@ -4240,6 +4321,8 @@
       return PerformSHLCombine(N, DCI, OptLevel);
     case ISD::AND:
       return PerformANDCombine(N, DCI);
+    case ISD::SELECT:
+      return PerformSELECTCombine(N, DCI);
   }
   return SDValue();
 }
Index: test/CodeGen/NVPTX/combine-min-max.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/combine-min-max.ll
@@ -0,0 +1,307 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -O2 | FileCheck %s
+
+; *************************************
+; * Cases with no min/max
+
+define i32 @ab_eq_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_eq_i32(
+; CHECK-NOT: min
+; CHECK-NOT: max
+  %cmp = icmp eq i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i64 @ba_ne_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_ne_i64(
+; CHECK-NOT: min
+; CHECK-NOT: max
+  %cmp = icmp ne i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+; PTX does have e.g. max.s16, but at least as of Kepler (sm_3x) that
+; gets compiled to SASS that converts the 16 bit parameters to 32 bit
+; before using a 32 bit instruction. That is probably not a win and
+; NVCC 7.5 does not emit 16 bit min/max either, presumably for that
+; reason.
+define i16 @ab_ugt_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: ab_ugt_i16(
+; CHECK-NOT: min
+; CHECK-NOT: max
+  %cmp = icmp ugt i16 %a, %b
+  %sel = select i1 %cmp, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+
+; *************************************
+; * All variations with i32
+
+; *** ab, unsigned, i32
+define i32 @ab_ugt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_ugt_i32(
+; CHECK: max.u32
+  %cmp = icmp ugt i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_uge_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_uge_i32(
+; CHECK: max.u32
+  %cmp = icmp uge i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_ult_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_ult_i32(
+; CHECK: min.u32
+  %cmp = icmp ult i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_ule_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_ule_i32(
+; CHECK: min.u32
+  %cmp = icmp ule i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+; *** ab, signed, i32
+define i32 @ab_sgt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_sgt_i32(
+; CHECK: max.s32
+  %cmp = icmp sgt i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_sge_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_sge_i32(
+; CHECK: max.s32
+  %cmp = icmp sge i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_slt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_slt_i32(
+; CHECK: min.s32
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+define i32 @ab_sle_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ab_sle_i32(
+; CHECK: min.s32
+  %cmp = icmp sle i32 %a, %b
+  %sel = select i1 %cmp, i32 %a, i32 %b
+  ret i32 %sel
+}
+
+; *** ba, unsigned, i32
+define i32 @ba_ugt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_ugt_i32(
+; CHECK: min.u32
+  %cmp = icmp ugt i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_uge_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_uge_i32(
+; CHECK: min.u32
+  %cmp = icmp uge i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_ult_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_ult_i32(
+; CHECK: max.u32
+  %cmp = icmp ult i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_ule_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_ule_i32(
+; CHECK: max.u32
+  %cmp = icmp ule i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+; *** ba, signed, i32
+define i32 @ba_sgt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_sgt_i32(
+; CHECK: min.s32
+  %cmp = icmp sgt i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_sge_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_sge_i32(
+; CHECK: min.s32
+  %cmp = icmp sge i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_slt_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_slt_i32(
+; CHECK: max.s32
+  %cmp = icmp slt i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+define i32 @ba_sle_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: ba_sle_i32(
+; CHECK: max.s32
+  %cmp = icmp sle i32 %a, %b
+  %sel = select i1 %cmp, i32 %b, i32 %a
+  ret i32 %sel
+}
+
+; *************************************
+; * All variations with i64
+
+; *** ab, unsigned, i64
+define i64 @ab_ugt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_ugt_i64(
+; CHECK: max.u64
+  %cmp = icmp ugt i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_uge_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_uge_i64(
+; CHECK: max.u64
+  %cmp = icmp uge i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_ult_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_ult_i64(
+; CHECK: min.u64
+  %cmp = icmp ult i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_ule_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_ule_i64(
+; CHECK: min.u64
+  %cmp = icmp ule i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+; *** ab, signed, i64
+define i64 @ab_sgt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_sgt_i64(
+; CHECK: max.s64
+  %cmp = icmp sgt i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_sge_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_sge_i64(
+; CHECK: max.s64
+  %cmp = icmp sge i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_slt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_slt_i64(
+; CHECK: min.s64
+  %cmp = icmp slt i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+define i64 @ab_sle_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ab_sle_i64(
+; CHECK: min.s64
+  %cmp = icmp sle i64 %a, %b
+  %sel = select i1 %cmp, i64 %a, i64 %b
+  ret i64 %sel
+}
+
+; *** ba, unsigned, i64
+define i64 @ba_ugt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_ugt_i64(
+; CHECK: min.u64
+  %cmp = icmp ugt i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_uge_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_uge_i64(
+; CHECK: min.u64
+  %cmp = icmp uge i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_ult_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_ult_i64(
+; CHECK: max.u64
+  %cmp = icmp ult i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_ule_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_ule_i64(
+; CHECK: max.u64
+  %cmp = icmp ule i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+; *** ba, signed, i64
+define i64 @ba_sgt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_sgt_i64(
+; CHECK: min.s64
+  %cmp = icmp sgt i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_sge_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_sge_i64(
+; CHECK: min.s64
+  %cmp = icmp sge i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_slt_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_slt_i64(
+; CHECK: max.s64
+  %cmp = icmp slt i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}
+
+define i64 @ba_sle_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: ba_sle_i64(
+; CHECK: max.s64
+  %cmp = icmp sle i64 %a, %b
+  %sel = select i1 %cmp, i64 %b, i64 %a
+  ret i64 %sel
+}