This is an archive of the discontinued LLVM Phabricator instance.

Differential D81430

[AMDGPU] Custom lowering of i64 umulo/smulo
ClosedPublic

Authored by rampitec on Jun 8 2020, 1:55 PM.

Download Raw Diff

Details

Reviewers

arsenm

Commits

rG295d1fe7333c: [AMDGPU] Custom lowering of i64 umulo/smulo

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

rampitec created this revision.Jun 8 2020, 1:55 PM

Herald added a project: Restricted Project. · View Herald TranscriptJun 8 2020, 1:55 PM

Herald added subscribers: kerbowa, hiraditya, t-tye and 7 others. · View Herald Transcript

Can you also add cases with power of 2 constants that the default expansion handles? I assume we miss out on these as-is?

// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5008	I assume this is extracted from the default expansion?
5011	Shift amount should be i32
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
5	Can you also add a pair that stress the scalar path and add a gfx9 run line

Addressed comments.

In D81430#2081054, @arsenm wrote:
Can you also add cases with power of 2 constants that the default expansion handles? I assume we miss out on these as-is?
// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }

It is questionable whether we are missing anything here with umulo, but we are probably missing quite a bit with smulo. The main difference is the avoidance of the 64-bit shifts we do after such lowering.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
5008	Right. A little simplified for what is legal for us, otherwise it is a default implementation.

Copied power of two optimization as well.

Missing test for smulo case?

In D81430#2081279, @arsenm wrote:

Missing test for smulo case?

Which one? smulo_i64_v_4? It is there. I thought all the tests were quite symmetrical.

arsenm accepted this revision.Jun 8 2020, 6:23 PM

This revision is now accepted and ready to land.Jun 8 2020, 6:23 PM

Closed by commit rG295d1fe7333c: [AMDGPU] Custom lowering of i64 umulo/smulo (authored by rampitec). · Explain WhyJun 8 2020, 11:57 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.h

1 line

SIISelLowering.cpp

42 lines

test/

CodeGen/

AMDGPU/

llvm.mulo.ll

365 lines

Diff 269418

llvm/lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines	private:

SDValue convertArgType(		SDValue convertArgType(
SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,		SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
bool Signed, const ISD::InputArg *Arg = nullptr) const;		bool Signed, const ISD::InputArg *Arg = nullptr) const;

/// Custom lowering for ISD::FP_ROUND for MVT::f16.		/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
		SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,		SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;

SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
▲ Show 20 Lines • Show All 333 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 761 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FNEG, MVT::v2f16, Custom);		setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
setOperationAction(ISD::FABS, MVT::v2f16, Custom);		setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}		}

for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {		for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
}		}

		setOperationAction(ISD::SMULO, MVT::i64, Custom);
		setOperationAction(ISD::UMULO, MVT::i64, Custom);

setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);		setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

▲ Show 20 Lines • Show All 3,647 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SMAX:		case ISD::SMAX:
case ISD::UMIN:		case ISD::UMIN:
case ISD::UMAX:		case ISD::UMAX:
case ISD::FADD:		case ISD::FADD:
case ISD::FMUL:		case ISD::FMUL:
case ISD::FMINNUM_IEEE:		case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:		case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);		return splitBinaryVectorOp(Op, DAG);
		case ISD::SMULO:
		case ISD::UMULO:
		return lowerXMULO(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:		case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);		return LowerDYNAMIC_STACKALLOC(Op, DAG);
}		}
return SDValue();		return SDValue();
}		}

static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,		static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
const SDLoc &DL,		const SDLoc &DL,
▲ Show 20 Lines • Show All 543 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)		if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);		return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);

if (VT == MVT::v4f16)		if (VT == MVT::v4f16)
return splitBinaryVectorOp(Op, DAG);		return splitBinaryVectorOp(Op, DAG);
return Op;		return Op;
}		}

		// Custom lowering for ISD::SMULO / ISD::UMULO nodes.
		//
		// Returns the merged pair { product, overflow }, where the overflow
		// flag is an i1 produced by a SETNE comparison. Two strategies are used:
		//   1. RHS is a (splat) constant power of two: shift left, shift back,
		//      and compare the round-tripped value against the original LHS.
		//   2. Otherwise: compute the low half with MUL and the high half with
		//      MULHS/MULHU, and compare the high half against the value it
		//      would have if no overflow had occurred.
		SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
		EVT VT = Op.getValueType();
		SDLoc SL(Op);
		SDValue LHS = Op.getOperand(0);
		SDValue RHS = Op.getOperand(1);
		// Signedness is derived from the opcode; anything that is not SMULO is
		// handled as unsigned.
		bool isSigned = Op.getOpcode() == ISD::SMULO;

		// Fast path: multiplication by a constant power of two.
		if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
		const APInt &C = RHSC->getAPIntValue();
		// mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
		if (C.isPowerOf2()) {
		arsenmUnsubmitted Done Reply Inline Actions I assume this is extracted from the default expansion? arsenm: I assume this is extracted from the default expansion?
		rampitecAuthorUnsubmitted Done Reply Inline Actions Right. A little simplified for what is legal for us, otherwise it is a default implementation. rampitec: Right. A little simplified for what is legal for us, otherwise it is a default implementation.
		// smulo(x, signed_min) is same as umulo(x, signed_min).
		// Hence the arithmetic shift is used only for signed multiplies whose
		// constant is not the minimum signed value; all other cases use a
		// logical shift.
		bool UseArithShift = isSigned && !C.isMinSignedValue();
		// The shift amount is log2(C), materialized as an i32 constant
		// independent of the value type VT.
		SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
		arsenmUnsubmitted Done Reply Inline Actions Shift amount should be i32 arsenm: Shift amount should be i32
		SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
		// Overflow iff shifting the result back does not reproduce LHS.
		SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
		DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
		SL, VT, Result, ShiftAmt),
		LHS, ISD::SETNE);
		return DAG.getMergeValues({ Result, Overflow }, SL);
		}
		}

		// General path: Result is the low half of the product, Top the high half
		// (signed or unsigned high multiply to match the opcode).
		SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
		SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
		SL, VT, LHS, RHS);

		// Expected high half when there is no overflow: for signed, the sign bit
		// of Result replicated across the word (SRA by bit width - 1); for
		// unsigned, zero.
		SDValue Sign = isSigned
		? DAG.getNode(ISD::SRA, SL, VT, Result,
		DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
		: DAG.getConstant(0, SL, VT);
		// Overflow iff the actual high half differs from the expected one.
		SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);

		return DAG.getMergeValues({ Result, Overflow }, SL);
		}

SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);

if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa \|\|		if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa \|\|
!Subtarget->isTrapHandlerEnabled())		!Subtarget->isTrapHandlerEnabled())
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);		return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

▲ Show 20 Lines • Show All 6,468 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.mulo.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SI %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GFX9 %s

				; Unsigned i64 multiply-with-overflow on two non-constant operands.
				; The { product, overflow } pair is returned directly; the expected
				; output below operates on the arguments in v[0:3] and tests the high
				; 64 bits of the product against zero.
				define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
				arsenmUnsubmitted Done Reply Inline Actions Can you also add a pair that stress the scalar path and add a gfx9 run line arsenm: Can you also add a pair that stress the scalar path and add a gfx9 run line
				; SI-LABEL: umulo_i64_v_v:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: v_mul_hi_u32 v4, v1, v2
				; SI-NEXT: v_mul_lo_u32 v5, v1, v2
				; SI-NEXT: v_mul_hi_u32 v6, v0, v3
				; SI-NEXT: v_mul_lo_u32 v7, v0, v3
				; SI-NEXT: v_mul_hi_u32 v8, v0, v2
				; SI-NEXT: v_mul_hi_u32 v9, v1, v3
				; SI-NEXT: v_mul_lo_u32 v3, v1, v3
				; SI-NEXT: v_mul_lo_u32 v0, v0, v2
				; SI-NEXT: v_add_i32_e32 v1, vcc, v8, v7
				; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
				; SI-NEXT: v_add_i32_e32 v6, vcc, v1, v5
				; SI-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
				; SI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
				; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
				; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
				; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
				; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: umulo_i64_v_v:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_lo_u32 v5, v0, v3
				; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2
				; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3
				; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
				; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2
				; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v6, v5
				; GFX9-NEXT: v_mul_hi_u32 v10, v1, v3
				; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
				; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
				; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v7
				; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
				; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
				; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v1
				; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc
				; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
				; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[3:4]
				; GFX9-NEXT: v_add3_u32 v1, v6, v5, v7
				; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				bb:
				%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
				ret { i64, i1 } %umulo
				}

				; Signed i64 multiply-with-overflow on two non-constant operands; the
				; { product, overflow } pair is returned directly. The expected output
				; compares the high 64 bits of the product against the sign of the low
				; 64 bits (v_ashrrev_i32 by 31, duplicated into both halves).
				; NOTE(review): despite the `_s_s` suffix, the expected output operates
				; on VGPR arguments (v0-v3), the same shape as the `_v_v` unsigned
				; test -- confirm whether the name is intentional.
				define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
				; SI-LABEL: smulo_i64_s_s:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: v_mul_hi_u32 v6, v1, v2
				; SI-NEXT: v_mul_lo_u32 v5, v1, v2
				; SI-NEXT: v_mul_hi_u32 v7, v0, v3
				; SI-NEXT: v_mul_lo_u32 v8, v0, v3
				; SI-NEXT: v_mul_hi_u32 v9, v0, v2
				; SI-NEXT: v_mul_hi_i32 v10, v1, v3
				; SI-NEXT: v_mul_lo_u32 v11, v1, v3
				; SI-NEXT: v_mov_b32_e32 v12, 0
				; SI-NEXT: v_mul_lo_u32 v4, v0, v2
				; SI-NEXT: v_add_i32_e32 v8, vcc, v9, v8
				; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
				; SI-NEXT: v_add_i32_e32 v9, vcc, v8, v5
				; SI-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
				; SI-NEXT: v_addc_u32_e32 v8, vcc, v7, v6, vcc
				; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
				; SI-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
				; SI-NEXT: v_mov_b32_e32 v7, v6
				; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v11
				; SI-NEXT: v_addc_u32_e32 v9, vcc, v12, v9, vcc
				; SI-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
				; SI-NEXT: v_subb_u32_e32 v10, vcc, v9, v12, vcc
				; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
				; SI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
				; SI-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
				; SI-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
				; SI-NEXT: v_subb_u32_e32 v8, vcc, v1, v12, vcc
				; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
				; SI-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
				; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
				; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; SI-NEXT: v_mov_b32_e32 v0, v4
				; SI-NEXT: v_mov_b32_e32 v1, v5
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: smulo_i64_s_s:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_mul_lo_u32 v5, v0, v3
				; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2
				; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3
				; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
				; GFX9-NEXT: v_mul_hi_u32 v4, v1, v2
				; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v6, v5
				; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc
				; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v7
				; GFX9-NEXT: v_mul_hi_i32 v10, v1, v3
				; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
				; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3
				; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
				; GFX9-NEXT: v_mov_b32_e32 v10, 0
				; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
				; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v9, vcc
				; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v4, v2
				; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, v8, v10, vcc
				; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
				; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v11, vcc
				; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v9, vcc
				; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, v8, v0
				; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v1, v10, vcc
				; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
				; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v4, vcc
				; GFX9-NEXT: v_add3_u32 v1, v6, v5, v7
				; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
				; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
				; GFX9-NEXT: v_mov_b32_e32 v6, v5
				; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
				; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[3:4], v[5:6]
				; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				bb:
				%smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
				ret { i64, i1 } %smulo
				}

				; Kernel variant of the unsigned i64 multiply-with-overflow test. Both
				; operands are kernel arguments (fetched with s_load_dwordx4 in the
				; expected output). The kernel stores 0 when the multiply overflows and
				; the product otherwise; the store address is undef.
				define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
				; SI-LABEL: umulo_i64_s:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_waitcnt lgkmcnt(0)
				; SI-NEXT: v_mov_b32_e32 v0, s2
				; SI-NEXT: v_mul_hi_u32 v1, s1, v0
				; SI-NEXT: s_mul_i32 s4, s1, s2
				; SI-NEXT: v_mov_b32_e32 v2, s3
				; SI-NEXT: v_mul_hi_u32 v3, s0, v2
				; SI-NEXT: s_mul_i32 s5, s0, s3
				; SI-NEXT: v_mul_hi_u32 v0, s0, v0
				; SI-NEXT: v_mul_hi_u32 v2, s1, v2
				; SI-NEXT: s_mul_i32 s1, s1, s3
				; SI-NEXT: s_mul_i32 s0, s0, s2
				; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v0
				; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
				; SI-NEXT: v_mov_b32_e32 v5, s0
				; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4
				; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
				; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
				; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v0
				; SI-NEXT: v_add_i32_e32 v0, vcc, s1, v1
				; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
				; SI-NEXT: v_add_i32_e32 v2, vcc, s4, v3
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
				; SI-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
				; SI-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
				; SI-NEXT: s_mov_b32 s6, -1
				; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
				; SI-NEXT: s_endpgm
				;
				; GFX9-LABEL: umulo_i64_s:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_mul_i32 s7, s0, s3
				; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
				; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
				; GFX9-NEXT: s_add_u32 s9, s8, s7
				; GFX9-NEXT: s_mul_i32 s6, s1, s2
				; GFX9-NEXT: s_addc_u32 s5, 0, s5
				; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
				; GFX9-NEXT: s_add_u32 s9, s9, s6
				; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3
				; GFX9-NEXT: s_addc_u32 s4, s5, s4
				; GFX9-NEXT: s_addc_u32 s5, s10, 0
				; GFX9-NEXT: s_mul_i32 s1, s1, s3
				; GFX9-NEXT: s_add_u32 s4, s4, s1
				; GFX9-NEXT: s_addc_u32 s5, 0, s5
				; GFX9-NEXT: s_add_i32 s1, s8, s7
				; GFX9-NEXT: s_add_i32 s1, s1, s6
				; GFX9-NEXT: s_mul_i32 s2, s0, s2
				; GFX9-NEXT: v_mov_b32_e32 v0, s1
				; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[4:5], 0
				; GFX9-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
				; GFX9-NEXT: v_mov_b32_e32 v0, s2
				; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
				; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
				; GFX9-NEXT: s_endpgm
				bb:
				%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
				%mul = extractvalue { i64, i1 } %umulo, 0
				%overflow = extractvalue { i64, i1 } %umulo, 1
				%res = select i1 %overflow, i64 0, i64 %mul
				store i64 %res, i64 addrspace(1)* undef
				ret void
				}

				; Kernel variant of the signed i64 multiply-with-overflow test. Both
				; operands are kernel arguments (fetched with s_load_dwordx4 in the
				; expected output). The kernel stores 0 when the multiply overflows and
				; the product otherwise; the store address is undef.
				; NOTE(review): the value named %umulo below holds the result of the
				; signed intrinsic (llvm.smul.with.overflow); the name is misleading
				; but has no effect on the test.
				define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
				; SI-LABEL: smulo_i64_s:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
				; SI-NEXT: v_mov_b32_e32 v0, 0
				; SI-NEXT: s_mov_b32 s7, 0xf000
				; SI-NEXT: s_waitcnt lgkmcnt(0)
				; SI-NEXT: v_mov_b32_e32 v1, s2
				; SI-NEXT: v_mul_hi_u32 v2, s1, v1
				; SI-NEXT: s_mul_i32 s4, s1, s2
				; SI-NEXT: v_mov_b32_e32 v3, s3
				; SI-NEXT: v_mul_hi_u32 v4, s0, v3
				; SI-NEXT: s_mul_i32 s5, s0, s3
				; SI-NEXT: v_mul_hi_u32 v1, s0, v1
				; SI-NEXT: v_mul_hi_i32 v3, s1, v3
				; SI-NEXT: s_mul_i32 s6, s1, s3
				; SI-NEXT: s_mul_i32 s8, s0, s2
				; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v1
				; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
				; SI-NEXT: v_mov_b32_e32 v6, s8
				; SI-NEXT: v_add_i32_e32 v5, vcc, s4, v5
				; SI-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
				; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
				; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
				; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2
				; SI-NEXT: v_addc_u32_e32 v3, vcc, v0, v3, vcc
				; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v1
				; SI-NEXT: v_subrev_i32_e32 v1, vcc, s2, v2
				; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
				; SI-NEXT: v_ashrrev_i32_e32 v0, 31, v4
				; SI-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0
				; SI-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
				; SI-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
				; SI-NEXT: v_mov_b32_e32 v1, v0
				; SI-NEXT: v_subrev_i32_e32 v5, vcc, s0, v2
				; SI-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v3, vcc
				; SI-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0
				; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
				; SI-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
				; SI-NEXT: v_cndmask_b32_e64 v1, v4, 0, vcc
				; SI-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
				; SI-NEXT: s_mov_b32 s6, -1
				; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
				; SI-NEXT: s_endpgm
				;
				; GFX9-LABEL: smulo_i64_s:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
				; GFX9-NEXT: s_waitcnt lgkmcnt(0)
				; GFX9-NEXT: s_mul_i32 s7, s0, s3
				; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
				; GFX9-NEXT: s_mul_hi_u32 s6, s0, s3
				; GFX9-NEXT: s_add_u32 s9, s8, s7
				; GFX9-NEXT: s_mul_i32 s5, s1, s2
				; GFX9-NEXT: s_addc_u32 s6, 0, s6
				; GFX9-NEXT: s_add_u32 s9, s9, s5
				; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
				; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
				; GFX9-NEXT: s_addc_u32 s4, s6, s4
				; GFX9-NEXT: s_addc_u32 s6, s10, 0
				; GFX9-NEXT: s_mul_i32 s9, s1, s3
				; GFX9-NEXT: s_add_u32 s4, s4, s9
				; GFX9-NEXT: s_addc_u32 s6, 0, s6
				; GFX9-NEXT: s_sub_u32 s9, s4, s2
				; GFX9-NEXT: s_subb_u32 s10, s6, 0
				; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s1, 0
				; GFX9-NEXT: v_mov_b32_e32 v0, s6
				; GFX9-NEXT: v_mov_b32_e32 v1, s10
				; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
				; GFX9-NEXT: v_mov_b32_e32 v1, s4
				; GFX9-NEXT: v_mov_b32_e32 v2, s9
				; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
				; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v2
				; GFX9-NEXT: s_add_i32 s1, s8, s7
				; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v0, vcc
				; GFX9-NEXT: s_add_i32 s1, s1, s5
				; GFX9-NEXT: v_cmp_lt_i32_e64 vcc, s3, 0
				; GFX9-NEXT: s_ashr_i32 s4, s1, 31
				; GFX9-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
				; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
				; GFX9-NEXT: s_mov_b32 s5, s4
				; GFX9-NEXT: s_mul_i32 s0, s0, s2
				; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
				; GFX9-NEXT: v_mov_b32_e32 v0, s0
				; GFX9-NEXT: v_mov_b32_e32 v2, s1
				; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
				; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
				; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
				; GFX9-NEXT: s_endpgm
				bb:
				%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
				%mul = extractvalue { i64, i1 } %umulo, 0
				%overflow = extractvalue { i64, i1 } %umulo, 1
				%res = select i1 %overflow, i64 0, i64 %mul
				store i64 %res, i64 addrspace(1)* undef
				ret void
				}

				; Signed i64 multiply-with-overflow by the power-of-two constant 4.
				; The expected output is a 64-bit shift left by 2, an arithmetic shift
				; right by 2 of that result, and a compare of the round-tripped value
				; against the original input.
				; NOTE(review): the value named %umulo below holds the result of the
				; signed intrinsic (llvm.smul.with.overflow).
				define { i64, i1 } @smulo_i64_v_4(i64 %i) {
				; SI-LABEL: smulo_i64_v_4:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: v_lshl_b64 v[5:6], v[0:1], 2
				; SI-NEXT: v_alignbit_b32 v4, v1, v0, 30
				; SI-NEXT: v_ashr_i64 v[2:3], v[5:6], 2
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[2:3], v[0:1]
				; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; SI-NEXT: v_mov_b32_e32 v0, v5
				; SI-NEXT: v_mov_b32_e32 v1, v4
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: smulo_i64_v_4:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
				; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30
				; GFX9-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5]
				; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[5:6], v[0:1]
				; GFX9-NEXT: v_mov_b32_e32 v0, v4
				; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GFX9-NEXT: v_mov_b32_e32 v1, v3
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				bb:
				%umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4)
				ret { i64, i1 } %umulo
				}

				; Unsigned i64 multiply-with-overflow by the power-of-two constant 4.
				; The expected output computes the product with a 64-bit shift left by
				; 2; the overflow flag comes from comparing the input against a copy of
				; itself whose high word is masked with 0x3fffffff (i.e. with its top
				; two bits cleared).
				define { i64, i1 } @umulo_i64_v_4(i64 %i) {
				; SI-LABEL: umulo_i64_v_4:
				; SI: ; %bb.0: ; %bb
				; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; SI-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
				; SI-NEXT: v_mov_b32_e32 v6, v0
				; SI-NEXT: v_lshl_b64 v[4:5], v[0:1], 2
				; SI-NEXT: v_alignbit_b32 v3, v1, v0, 30
				; SI-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
				; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; SI-NEXT: v_mov_b32_e32 v0, v4
				; SI-NEXT: v_mov_b32_e32 v1, v3
				; SI-NEXT: s_setpc_b64 s[30:31]
				;
				; GFX9-LABEL: umulo_i64_v_4:
				; GFX9: ; %bb.0: ; %bb
				; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
				; GFX9-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1
				; GFX9-NEXT: v_mov_b32_e32 v6, v0
				; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, v[6:7], v[0:1]
				; GFX9-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
				; GFX9-NEXT: v_alignbit_b32 v3, v1, v0, 30
				; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
				; GFX9-NEXT: v_mov_b32_e32 v0, v4
				; GFX9-NEXT: v_mov_b32_e32 v1, v3
				; GFX9-NEXT: s_setpc_b64 s[30:31]
				bb:
				%umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4)
				ret { i64, i1 } %umulo
				}

				declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
				declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)