This is an archive of the discontinued LLVM Phabricator instance.

[NVPTX] Implement min/max in tablegen, rather than with custom DAGComine logic.
ClosedPublic

Authored by jlebar on Jan 14 2017, 8:55 AM.

Download Raw Diff

Details

Reviewers

Commits

rGcc938fc197ce: [NVPTX] Implement min/max in tablegen, rather than with custom DAGComine logic.
rL292304: [NVPTX] Implement min/max in tablegen, rather than with custom DAGComine logic.

Summary

This change also lets us use max.{s,u}16. There's a vague warning in a
test about this maybe being less efficient, but I could not come up with
a case where the resulting SASS (sm_35 or sm_60) was different with or
without max.{s,u}16. It's true that nvcc seems to emit only
max.{s,u}32, but even ptxas 7.0 seems to have no problem generating
efficient SASS from max.{s,u}16 (the casts up to i32 and back down to
i16 seem to be implicit and nops, happening via register aliasing).

In the absence of evidence, better to have fewer special cases, emit
more straightforward code, etc. In particular, if a new GPU has 16-bit
min/max instructions, we want to be able to use them.

Diff Detail

Repository: rL LLVM

Event Timeline

jlebar updated this revision to Diff 84454. · Jan 14 2017, 8:55 AM

jlebar retitled this revision to "[NVPTX] Implement min/max in tablegen, rather than with custom DAGComine logic."

jlebar updated this object.

jlebar added a reviewer: tra.

jlebar added a subscriber: llvm-commits.

Herald added a subscriber: jholewinski. · View Herald Transcript · Jan 14 2017, 8:55 AM

jlebar added a child revision: D28793: [NVPTX] Auto-upgrade some NVPTX intrinsics to LLVM target-generic code. · Jan 16 2017, 11:29 PM

tra accepted this revision. · Jan 17 2017, 1:52 PM

This revision is now accepted and ready to land. · Jan 17 2017, 1:52 PM

Closed by commit rL292304: [NVPTX] Implement min/max in tablegen, rather than with custom DAGComine logic. (authored by jlebar). · Explain Why · Jan 17 2017, 4:19 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

NVPTX/

NVPTXISelLowering.cpp

80 lines

NVPTXInstrInfo.td

6 lines

test/

CodeGen/

NVPTX/

combine-min-max.ll

134 lines

Diff 84772

llvm/trunk/lib/Target/NVPTX/NVPTXISelLowering.cpp

Show First 20 Lines • Show All 284 Lines • ▼ Show 20 Lines	if (IsPTXVectorType(VT)) {
setOperationAction(ISD::STORE, VT, Custom);		setOperationAction(ISD::STORE, VT, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);		setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
}		}
}		}

// Custom handling for i8 intrinsics		// Custom handling for i8 intrinsics
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);		setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

setOperationAction(ISD::CTLZ, MVT::i16, Legal);		for (const auto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
setOperationAction(ISD::CTLZ, MVT::i32, Legal);		setOperationAction(ISD::SMIN, Ty, Legal);
setOperationAction(ISD::CTLZ, MVT::i64, Legal);		setOperationAction(ISD::SMAX, Ty, Legal);
		setOperationAction(ISD::UMIN, Ty, Legal);
		setOperationAction(ISD::UMAX, Ty, Legal);

		setOperationAction(ISD::CTPOP, Ty, Legal);
		setOperationAction(ISD::CTLZ, Ty, Legal);
		}

setOperationAction(ISD::CTTZ, MVT::i16, Expand);		setOperationAction(ISD::CTTZ, MVT::i16, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);		setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i64, Expand);		setOperationAction(ISD::CTTZ, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i16, Legal);
setOperationAction(ISD::CTPOP, MVT::i32, Legal);
setOperationAction(ISD::CTPOP, MVT::i64, Legal);

// PTX does not directly support SELP of i1, so promote to i32 first		// PTX does not directly support SELP of i1, so promote to i32 first
setOperationAction(ISD::SELECT, MVT::i1, Custom);		setOperationAction(ISD::SELECT, MVT::i1, Custom);

// PTX cannot multiply two i64s in a single instruction.		// PTX cannot multiply two i64s in a single instruction.
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);		setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);		setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

// We have some custom DAG combine patterns for these nodes		// We have some custom DAG combine patterns for these nodes
setTargetDAGCombine(ISD::ADD);		setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::AND);		setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::FADD);		setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::MUL);		setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SHL);		setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SREM);		setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);		setTargetDAGCombine(ISD::UREM);

if (!STI.allowFP16Math()) {		if (!STI.allowFP16Math()) {
// Promote fp16 arithmetic if fp16 hardware isn't available or the		// Promote fp16 arithmetic if fp16 hardware isn't available or the
// user passed --nvptx-no-fp16-math. The flag is useful because,		// user passed --nvptx-no-fp16-math. The flag is useful because,
// although sm_53+ GPUs have some sort of FP16 support in		// although sm_53+ GPUs have some sort of FP16 support in
// hardware, only sm_53 and sm_60 have full implementation. Others		// hardware, only sm_53 and sm_60 have full implementation. Others
▲ Show 20 Lines • Show All 3,829 Lines • ▼ Show 20 Lines	if (Val->getOpcode() == NVPTXISD::LoadV2 \|\|

// If we get here, the AND is unnecessary. Just replace it with the load		// If we get here, the AND is unnecessary. Just replace it with the load
DCI.CombineTo(N, Val, AddTo);		DCI.CombineTo(N, Val, AddTo);
}		}

return SDValue();		return SDValue();
}		}

static SDValue PerformSELECTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
// Currently this detects patterns for integer min and max and
// lowers them to PTX-specific intrinsics that enable hardware
// support.

const SDValue Cond = N->getOperand(0);
if (Cond.getOpcode() != ISD::SETCC) return SDValue();

const SDValue LHS = Cond.getOperand(0);
const SDValue RHS = Cond.getOperand(1);
const SDValue True = N->getOperand(1);
const SDValue False = N->getOperand(2);
if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
return SDValue();

const EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64) return SDValue();

const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue Larger; // The larger of LHS and RHS when condition is true.
switch (CC) {
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETLT:
case ISD::SETLE:
Larger = RHS;
break;

case ISD::SETGT:
case ISD::SETGE:
case ISD::SETUGT:
case ISD::SETUGE:
Larger = LHS;
break;

default:
return SDValue();
}
const bool IsMax = (Larger == True);
const bool IsSigned = ISD::isSignedIntSetCC(CC);

unsigned IntrinsicId;
if (VT == MVT::i32) {
if (IsSigned)
IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i;
else
IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui;
} else {
assert(VT == MVT::i64);
if (IsSigned)
IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll;
else
IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull;
}

SDLoc DL(N);
return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS);
}

static SDValue PerformREMCombine(SDNode *N,		static SDValue PerformREMCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,		TargetLowering::DAGCombinerInfo &DCI,
CodeGenOpt::Level OptLevel) {		CodeGenOpt::Level OptLevel) {
assert(N->getOpcode() == ISD::SREM \|\| N->getOpcode() == ISD::UREM);		assert(N->getOpcode() == ISD::SREM \|\| N->getOpcode() == ISD::UREM);

// Don't do anything at less than -O2.		// Don't do anything at less than -O2.
if (OptLevel < CodeGenOpt::Default)		if (OptLevel < CodeGenOpt::Default)
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 193 Lines • ▼ Show 20 Lines	switch (N->getOpcode()) {
case ISD::FADD:		case ISD::FADD:
return PerformADDCombine(N, DCI, STI, OptLevel);		return PerformADDCombine(N, DCI, STI, OptLevel);
case ISD::MUL:		case ISD::MUL:
return PerformMULCombine(N, DCI, OptLevel);		return PerformMULCombine(N, DCI, OptLevel);
case ISD::SHL:		case ISD::SHL:
return PerformSHLCombine(N, DCI, OptLevel);		return PerformSHLCombine(N, DCI, OptLevel);
case ISD::AND:		case ISD::AND:
return PerformANDCombine(N, DCI);		return PerformANDCombine(N, DCI);
case ISD::SELECT:
return PerformSELECTCombine(N, DCI);
case ISD::UREM:		case ISD::UREM:
case ISD::SREM:		case ISD::SREM:
return PerformREMCombine(N, DCI, OptLevel);		return PerformREMCombine(N, DCI, OptLevel);
}		}
return SDValue();		return SDValue();
}		}

/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.		/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
▲ Show 20 Lines • Show All 283 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td

Show First 20 Lines • Show All 523 Lines • ▼ Show 20 Lines	def : NVPTXInst<(outs RC:$dst), (ins RC:$a),
!strconcat("abs", SizeName, " \t$dst, $a;"),		!strconcat("abs", SizeName, " \t$dst, $a;"),
[(set RC:$dst, (xor (add (sra RC:$a, (i32 NumBits)), RC:$a),		[(set RC:$dst, (xor (add (sra RC:$a, (i32 NumBits)), RC:$a),
(sra RC:$a, (i32 NumBits))))]>;		(sra RC:$a, (i32 NumBits))))]>;
}		}
defm ABS_16 : ABS<Int16Regs, 15, ".s16">;		defm ABS_16 : ABS<Int16Regs, 15, ".s16">;
defm ABS_32 : ABS<Int32Regs, 31, ".s32">;		defm ABS_32 : ABS<Int32Regs, 31, ".s32">;
defm ABS_64 : ABS<Int64Regs, 63, ".s64">;		defm ABS_64 : ABS<Int64Regs, 63, ".s64">;

		// Integer min/max.
		defm SMAX : I3<"max.s", smax>;
		defm UMAX : I3<"max.u", umax>;
		defm SMIN : I3<"min.s", smin>;
		defm UMIN : I3<"min.u", umin>;

//		//
// Wide multiplication		// Wide multiplication
//		//
def MULWIDES64 :		def MULWIDES64 :
NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),		NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
"mul.wide.s32 \t$dst, $a, $b;", []>;		"mul.wide.s32 \t$dst, $a, $b;", []>;
def MULWIDES64Imm :		def MULWIDES64Imm :
NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),		NVPTXInst<(outs Int64Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
▲ Show 20 Lines • Show All 2,517 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/NVPTX/combine-min-max.ll

	Show All 15 Lines
	; LABEL: @ab_ne_i64			; LABEL: @ab_ne_i64
	; CHECK-NOT: min			; CHECK-NOT: min
	; CHECK-NOT: max			; CHECK-NOT: max
	%cmp = icmp ne i64 %a, %b			%cmp = icmp ne i64 %a, %b
	%sel = select i1 %cmp, i64 %b, i64 %a			%sel = select i1 %cmp, i64 %b, i64 %a
	ret i64 %sel			ret i64 %sel
	}			}

	; PTX does have e.g. max.s16, but at least as of Kepler (sm_3x) that			; *************************************
	; gets compiled to SASS that converts the 16 bit parameters to 32 bit			; * All variations with i16
	; before using a 32 bit instruction. That is probably not a win and
	; NVCC 7.5 does not emit 16 bit min/max either, presumably for that			; *** ab, unsigned, i16
	; reason.
	define i16 @ab_ugt_i16(i16 %a, i16 %b) {			define i16 @ab_ugt_i16(i16 %a, i16 %b) {
	; LABEL: @ab_ugt_i16			; LABEL: @ab_ugt_i16
	; CHECK-NOT: min			; CHECK: max.u16
	; CHECK-NOT: max
	%cmp = icmp ugt i16 %a, %b			%cmp = icmp ugt i16 %a, %b
	%sel = select i1 %cmp, i16 %a, i16 %b			%sel = select i1 %cmp, i16 %a, i16 %b
	ret i16 %sel			ret i16 %sel
	}			}

				define i16 @ab_uge_i16(i16 %a, i16 %b) {
				; LABEL: @ab_uge_i16
				; CHECK: max.u16
				%cmp = icmp uge i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				define i16 @ab_ult_i16(i16 %a, i16 %b) {
				; LABEL: @ab_ult_i16
				; CHECK: min.u16
				%cmp = icmp ult i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				define i16 @ab_ule_i16(i16 %a, i16 %b) {
				; LABEL: @ab_ule_i16
				; CHECK: min.u16
				%cmp = icmp ule i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; *** ab, signed, i16
				define i16 @ab_sgt_i16(i16 %a, i16 %b) {
				; LABEL: @ab_sgt_i16
				; CHECK: max.s16
				%cmp = icmp sgt i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				define i16 @ab_sge_i16(i16 %a, i16 %b) {
				; LABEL: @ab_sge_i16
				; CHECK: max.s16
				%cmp = icmp sge i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				define i16 @ab_slt_i16(i16 %a, i16 %b) {
				; LABEL: @ab_slt_i16
				; CHECK: min.s16
				%cmp = icmp slt i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				define i16 @ab_sle_i16(i16 %a, i16 %b) {
				; LABEL: @ab_sle_i16
				; CHECK: min.s16
				%cmp = icmp sle i16 %a, %b
				%sel = select i1 %cmp, i16 %a, i16 %b
				ret i16 %sel
				}

				; *** ba, unsigned, i16
				define i16 @ba_ugt_i16(i16 %a, i16 %b) {
				; LABEL: @ba_ugt_i16
				; CHECK: min.u16
				%cmp = icmp ugt i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_uge_i16(i16 %a, i16 %b) {
				; LABEL: @ba_uge_i16
				; CHECK: min.u16
				%cmp = icmp uge i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_ult_i16(i16 %a, i16 %b) {
				; LABEL: @ba_ult_i16
				; CHECK: max.u16
				%cmp = icmp ult i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_ule_i16(i16 %a, i16 %b) {
				; LABEL: @ba_ule_i16
				; CHECK: max.u16
				%cmp = icmp ule i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				; *** ba, signed, i16
				define i16 @ba_sgt_i16(i16 %a, i16 %b) {
				; LABEL: @ba_sgt_i16
				; CHECK: min.s16
				%cmp = icmp sgt i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_sge_i16(i16 %a, i16 %b) {
				; LABEL: @ba_sge_i16
				; CHECK: min.s16
				%cmp = icmp sge i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_slt_i16(i16 %a, i16 %b) {
				; LABEL: @ba_slt_i16
				; CHECK: max.s16
				%cmp = icmp slt i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

				define i16 @ba_sle_i16(i16 %a, i16 %b) {
				; LABEL: @ba_sle_i16
				; CHECK: max.s16
				%cmp = icmp sle i16 %a, %b
				%sel = select i1 %cmp, i16 %b, i16 %a
				ret i16 %sel
				}

	; *************************************			; *************************************
	; * All variations with i32			; * All variations with i32

	; *** ab, unsigned, i32			; *** ab, unsigned, i32
	define i32 @ab_ugt_i32(i32 %a, i32 %b) {			define i32 @ab_ugt_i32(i32 %a, i32 %b) {
	; LABEL: @ab_ugt_i32			; LABEL: @ab_ugt_i32
	; CHECK: max.u32			; CHECK: max.u32
	▲ Show 20 Lines • Show All 262 Lines • Show Last 20 Lines