This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner][AMDGPU][X86] Turn cttz/ctlz into cttz_zero_undef/ctlz_zero_undef if we can prove the input is never zero
ClosedPublic

Authored by craig.topper on Feb 6 2018, 2:18 PM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon
wdng
tstellar
arsenm

Commits

rG58ecffd857a9: [DAGCombiner][AMDGPU][X86] Turn cttz/ctlz into cttz_zero_undef/ctlz_zero_undef…
rL324427: [DAGCombiner][AMDGPU][X86] Turn cttz/ctlz into cttz_zero_undef/ctlz_zero_undef…

Summary

X86 currently has a late DAG combine after cttz/ctlz are turned into BSR+BSF+CMOV to detect this and remove the CMOV. But we should be able to do this much earlier and avoid creating the cmov altogether.

For the changed AMDGPU test case it appears that previously the i8 cttz was type legalized to i16 which introduced an OR with 256 in order to limit the result to 8 on the widened type. At this point the result is known to never be zero, but nothing checked that. Then operation legalization is told to promote all i16 cttz to i32. This introduces an extend and a truncate and another OR with 65536 to limit the result to 16. With the DAG combiner change we are able to prevent the creation of the second OR since the opcode will have been changed to cttz_zero_undef after the first OR. I believe the lack of the OR caused the instruction to change to v_ffbl_b32_sdwa.

Diff Detail

Repository: rL LLVM

Event Timeline

craig.topper created this revision.Feb 6 2018, 2:18 PM

Herald added subscribers: t-tye, tpr, dstuttard and 3 others. · View Herald TranscriptFeb 6 2018, 2:18 PM

LGTM

This revision is now accepted and ready to land.Feb 6 2018, 2:26 PM

Closed by commit rL324427: [DAGCombiner][AMDGPU][X86] Turn cttz/ctlz into cttz_zero_undef/ctlz_zero_undef… (authored by ctopper). · Explain WhyFeb 6 2018, 3:59 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

14 lines

Target/

X86/

X86ISelLowering.cpp

11 lines

test/

CodeGen/

AMDGPU/

cttz_zero_undef.ll

3 lines

Diff 133102

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,280 Lines • ▼ Show 20 Lines

	SDValue DAGCombiner::visitCTLZ(SDNode *N) {			SDValue DAGCombiner::visitCTLZ(SDNode *N) {
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);

	// fold (ctlz c1) -> c2			// fold (ctlz c1) -> c2
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0))			if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
	return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);			return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);

				// If the value is known never to be zero, switch to the undef version.
				if (!LegalOperations \|\| TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
				if (DAG.isKnownNeverZero(N0))
				return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
				}

	return SDValue();			return SDValue();
	}			}

	SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {			SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);

	// fold (ctlz_zero_undef c1) -> c2			// fold (ctlz_zero_undef c1) -> c2
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0))			if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
	return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);			return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
	return SDValue();			return SDValue();
	}			}

	SDValue DAGCombiner::visitCTTZ(SDNode *N) {			SDValue DAGCombiner::visitCTTZ(SDNode *N) {
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);

	// fold (cttz c1) -> c2			// fold (cttz c1) -> c2
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0))			if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
	return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);			return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);

				// If the value is known never to be zero, switch to the undef version.
				if (!LegalOperations \|\| TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
				if (DAG.isKnownNeverZero(N0))
				return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
				}

	return SDValue();			return SDValue();
	}			}

	SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {			SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
	SDValue N0 = N->getOperand(0);			SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);			EVT VT = N->getValueType(0);

	// fold (cttz_zero_undef c1) -> c2			// fold (cttz_zero_undef c1) -> c2
	▲ Show 20 Lines • Show All 11,511 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 32,060 Lines • ▼ Show 20 Lines	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {		const X86Subtarget &Subtarget) {
SDLoc DL(N);		SDLoc DL(N);

SDValue FalseOp = N->getOperand(0);		SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);		SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);		X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
SDValue Cond = N->getOperand(3);		SDValue Cond = N->getOperand(3);

if (CC == X86::COND_E \|\| CC == X86::COND_NE) {
switch (Cond.getOpcode()) {
default: break;
case X86ISD::BSR:
case X86ISD::BSF:
// If operand of BSR / BSF are proven never zero, then ZF cannot be set.
if (DAG.isKnownNeverZero(Cond.getOperand(0)))
return (CC == X86::COND_E) ? FalseOp : TrueOp;
}
}

// Try to simplify the EFLAGS and condition code operands.		// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.		// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {		if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 \|\| hasFPCMov(CC)) {		if (FalseOp.getValueType() != MVT::f80 \|\| hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),		SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
Flags};		Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);		return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}		}
▲ Show 20 Lines • Show All 6,808 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/cttz_zero_undef.ll

Show First 20 Lines • Show All 233 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
%cmp = icmp ne i32 %ctlz, 32		%cmp = icmp ne i32 %ctlz, 32
%sel = select i1 %cmp, i32 %ctlz, i32 -1		%sel = select i1 %cmp, i32 %ctlz, i32 -1
store i32 %sel, i32 addrspace(1)* %out		store i32 %sel, i32 addrspace(1)* %out
ret void		ret void
}		}

; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:		; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
; SI: {{buffer\|flat}}_load_ubyte		; SI: {{buffer\|flat}}_load_ubyte
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}		; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
		; SI-SDWA: v_ffbl_b32_sdwa
; EG: MEM_RAT MSKOR		; EG: MEM_RAT MSKOR
; EG: FFBL_INT		; EG: FFBL_INT
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {		define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
%val = load i8, i8 addrspace(1)* %arrayidx, align 1		%val = load i8, i8 addrspace(1)* %arrayidx, align 1
%ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone		%ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
%cmp = icmp eq i8 %val, 0		%cmp = icmp eq i8 %val, 0
%sel = select i1 %cmp, i8 -1, i8 %ctlz		%sel = select i1 %cmp, i8 -1, i8 %ctlz
store i8 %sel, i8 addrspace(1)* %out		store i8 %sel, i8 addrspace(1)* %out
Show All 19 Lines