This is an archive of the discontinued LLVM Phabricator instance.

Differential D81430

[AMDGPU] Custom lowering of i64 umulo/smulo
ClosedPublic

Authored by rampitec on Jun 8 2020, 1:55 PM.

Download Raw Diff

Details

Reviewers

arsenm

Commits

rG295d1fe7333c: [AMDGPU] Custom lowering of i64 umulo/smulo

Diff Detail

Event Timeline

rampitec created this revision.Jun 8 2020, 1:55 PM

Herald added a project: Restricted Project. · View Herald TranscriptJun 8 2020, 1:55 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 7 others. · View Herald Transcript

Can you also add cases with power of 2 constants that the default expansion handles? I assume we miss out on these as-is?

// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5008	I assume this is extracted from the default expansion?
5011	Shift amount should be i32
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
4	Can you also add a pair that stress the scalar path and add a gfx9 run line

Addressed comments.

In D81430#2081054, @arsenm wrote:
Can you also add cases with power of 2 constants that the default expansion handles? I assume we miss out on these as-is?
// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }

It is questionable whether we are missing something here with umulo; we are probably missing quite a bit with smulo. The main difference is the avoidance of the 64-bit shifts we do after such lowering.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5008	Right. A little simplified for what is legal for us, otherwise it is a default implementation.

Copied the power-of-two optimization as well.

Missing test for smulo case?

In D81430#2081279, @arsenm wrote:

Missing test for smulo case?

Which one? smulo_i64_v_4? It is there. I thought all the tests were quite symmetrical.

arsenm accepted this revision.Jun 8 2020, 6:23 PM

This revision is now accepted and ready to land.Jun 8 2020, 6:23 PM

Closed by commit rG295d1fe7333c: [AMDGPU] Custom lowering of i64 umulo/smulo (authored by rampitec). · Explain WhyJun 8 2020, 11:57 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.h

1 line

SIISelLowering.cpp

26 lines

test/

CodeGen/

AMDGPU/

llvm.mulo.ll

76 lines

Diff 269350

llvm/lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines	private:

SDValue convertArgType(		SDValue convertArgType(
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,		SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
bool Signed, const ISD::InputArg *Arg = nullptr) const;		bool Signed, const ISD::InputArg *Arg = nullptr) const;

/// Custom lowering for ISD::FP_ROUND for MVT::f16.		/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
		SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,		SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;

SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
▲ Show 20 Lines • Show All 333 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 761 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FNEG, MVT::v2f16, Custom);		setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
setOperationAction(ISD::FABS, MVT::v2f16, Custom);		setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}		}

for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {		for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
}		}

		setOperationAction(ISD::SMULO, MVT::i64, Custom);
		setOperationAction(ISD::UMULO, MVT::i64, Custom);

setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

▲ Show 20 Lines • Show All 3,647 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMAX:		case ISD::SMAX:
case ISD::UMIN:		case ISD::UMIN:
case ISD::UMAX:		case ISD::UMAX:
case ISD::FADD:		case ISD::FADD:
case ISD::FMUL:		case ISD::FMUL:
case ISD::FMINNUM_IEEE:		case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:		case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);		return splitBinaryVectorOp(Op, DAG);
		case ISD::SMULO:
		case ISD::UMULO:
		return lowerXMULO(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:		case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);		return LowerDYNAMIC_STACKALLOC(Op, DAG);
}		}
return SDValue();		return SDValue();
}		}

static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,		static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
const SDLoc &DL,		const SDLoc &DL,
▲ Show 20 Lines • Show All 543 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)		if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);		return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

if (VT == MVT::v4f16)		if (VT == MVT::v4f16)
return splitBinaryVectorOp(Op, DAG);		return splitBinaryVectorOp(Op, DAG);
return Op;		return Op;
}		}

		SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
		EVT VT = Op.getValueType();
		SDLoc SL(Op);
		SDValue LHS = Op.getOperand(0);
		SDValue RHS = Op.getOperand(1);
		bool isSigned = Op.getOpcode() == ISD::SMULO;

		SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
		SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
		SL, VT, LHS, RHS);

		arsenmUnsubmitted Done Reply Inline Actions I assume this is extracted from the default expansion? arsenm: I assume this is extracted from the default expansion?
		rampitecAuthorUnsubmitted Done Reply Inline Actions Right. A little simplified for what is legal for us, otherwise it is a default implementation. rampitec: Right. A little simplified for what is legal for us, otherwise it is a default implementation.
		SDValue Sign = isSigned
		? DAG.getNode(ISD::SRA, SL, VT, Result,
		DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i64))
		arsenmUnsubmitted Done Reply Inline Actions Shift amount should be i32 arsenm: Shift amount should be i32
		: DAG.getConstant(0, SL, VT);
		SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);

		return DAG.getMergeValues({ Result, Overflow }, SL);
		}

SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);

if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa \|\|		if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa \|\|
!Subtarget->isTrapHandlerEnabled())		!Subtarget->isTrapHandlerEnabled())
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);		return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

▲ Show 20 Lines • Show All 6,468 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.mulo.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN %s

				define { i64, i1 } @umulo_i64(i64 %x, i64 %y) {
				arsenmUnsubmitted Done Reply Inline Actions Can you also add a pair that stress the scalar path and add a gfx9 run line arsenm: Can you also add a pair that stress the scalar path and add a gfx9 run line
				; GCN-LABEL: umulo_i64:
				; GCN: ; %bb.0: ; %bb
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
				; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
				; GCN-NEXT: v_mul_hi_u32 v6, v0, v3
				; GCN-NEXT: v_mul_lo_u32 v7, v0, v3
				; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
				; GCN-NEXT: v_mul_hi_u32 v9, v1, v3
				; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
				; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
				; GCN-NEXT: v_add_i32_e32 v1, vcc, v8, v7
				; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
				; GCN-NEXT: v_add_i32_e32 v6, vcc, v1, v5
				; GCN-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
				; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
				; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
				; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
				; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
				; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
				; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GCN-NEXT: s_setpc_b64 s[30:31]
				bb:
				%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
				ret { i64, i1 } %umulo
				}

				define { i64, i1 } @smulo_i64(i64 %x, i64 %y) {
				; GCN-LABEL: smulo_i64:
				; GCN: ; %bb.0: ; %bb
				; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GCN-NEXT: v_mul_hi_u32 v6, v1, v2
				; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
				; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
				; GCN-NEXT: v_mul_lo_u32 v8, v0, v3
				; GCN-NEXT: v_mul_hi_u32 v9, v0, v2
				; GCN-NEXT: v_mul_hi_i32 v10, v1, v3
				; GCN-NEXT: v_mul_lo_u32 v11, v1, v3
				; GCN-NEXT: v_mov_b32_e32 v12, 0
				; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
				; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
				; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
				; GCN-NEXT: v_add_i32_e32 v9, vcc, v8, v5
				; GCN-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
				; GCN-NEXT: v_addc_u32_e32 v8, vcc, v7, v6, vcc
				; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v5
				; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
				; GCN-NEXT: v_mov_b32_e32 v7, v6
				; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v11
				; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v9, vcc
				; GCN-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
				; GCN-NEXT: v_subb_u32_e32 v10, vcc, v9, v12, vcc
				; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
				; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
				; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
				; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
				; GCN-NEXT: v_subb_u32_e32 v8, vcc, v1, v12, vcc
				; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
				; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
				; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
				; GCN-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
				; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GCN-NEXT: v_mov_b32_e32 v0, v4
				; GCN-NEXT: v_mov_b32_e32 v1, v5
				; GCN-NEXT: s_setpc_b64 s[30:31]
				bb:
				%smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
				ret { i64, i1 } %smulo
				}

				declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
				declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)