This will prevent the following regression when enabling i16 support (D18049):
test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
lib/Target/AMDGPU/SIISelLowering.cpp:3560
I don't think this is correct, because it's replacing SINT_TO_FP with CVT_F32_UBYTE0, which is an unsigned conversion.
lib/Target/AMDGPU/SIISelLowering.cpp:3525–3527
I think calling simplifyDemandedBits either here or in the performUCharToFloat combine will eliminate the need for this code.

lib/Target/AMDGPU/SIISelLowering.cpp:3560
Nevermind, I see that this transform is correct, because performUCharToFloatCombine() is checking that the high bits are all zero.
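For context, the guard being referred to might look roughly like the following fragment (a minimal sketch assuming the usual combine signature with N and DCI in scope, not the in-tree code): CVT_F32_UBYTE0 only reads the low byte, so the rewrite is sound only when the remaining bits are provably zero.

    // Sketch of the known-bits guard described above, not the committed code.
    // CVT_F32_UBYTE0 converts only bits [7:0], so the source must be known to
    // fit in an unsigned byte.
    SDValue Src = N->getOperand(0);
    if (!DCI.DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24)))
      return SDValue(); // High 24 bits not known zero; keep the original node.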
lib/Target/AMDGPU/SIISelLowering.cpp:3525–3527
In this case, simplifyDemandedBits will replace zero_extend with any_extend.
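For reference, the suggested call would typically be made from the combine roughly as sketched below (hedged: N and DCI are assumed from the standard combine signature, and this uses the DAGCombinerInfo overload of TargetLowering::SimplifyDemandedBits rather than the code under review). The effect described above follows because any_extend leaves the high bits undefined, so they are no longer known zero afterwards.

    // Sketch of the suggested approach, not the committed change: demand only
    // the low 8 bits of the i32 source, since CVT_F32_UBYTE0 ignores the rest.
    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
    SDValue Src = N->getOperand(0);
    // With only bits [7:0] demanded, (zero_extend i8) can legally become
    // (any_extend i8) -- after which the high bits stop being known zero.
    if (TLI.SimplifyDemandedBits(Src, APInt::getLowBitsSet(32, 8), DCI))
      return SDValue(N, 0);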
LGTM. Can you mention in the commit message that this will prevent a regression when enabling i16 support, and note the test that would regress.
lib/Target/AMDGPU/SIISelLowering.cpp:3560
That checks whether the high 24 bits are zero. For the signed case it needs to check that the high 25 bits are zero, so it is still incorrect.
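To make the width requirement concrete, here is a small self-contained illustration (cvtF32Ubyte0 below is a hypothetical model of the instruction's semantics, not a real API): with only the high 24 bits known zero, bit 7 may still be set, and a signed byte-sized source then disagrees with the unsigned conversion.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical model of V_CVT_F32_UBYTE0: unsigned conversion of bits [7:0].
    static float cvtF32Ubyte0(uint32_t Reg) {
      return static_cast<float>(Reg & 0xffu);
    }

    int main() {
      uint32_t Reg = 0x000000C8; // High 24 bits zero; value is 200.

      // Unsigned source: uint_to_fp(200) == 200.0 == cvt_f32_ubyte0(Reg). Sound.
      std::printf("unsigned: %f vs %f\n", (float)Reg, cvtF32Ubyte0(Reg));

      // Signed i8 source with the same bit pattern: sint_to_fp gives -56.0,
      // while cvt_f32_ubyte0 still yields 200.0. Excluding this case requires
      // bit 7 to be known zero too, i.e. 25 known-zero high bits rather than 24.
      int8_t S = static_cast<int8_t>(Reg); // -56
      std::printf("signed:   %f vs %f\n", (float)S, cvtF32Ubyte0(Reg));
      return 0;
    }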