Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -119,6 +119,7 @@
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
                              SelectionDAG &DAG) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -767,6 +767,9 @@
     setOperationAction(ISD::SELECT, VT, Custom);
   }
 
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+  setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -4430,6 +4433,9 @@
   case ISD::FMINNUM_IEEE:
   case ISD::FMAXNUM_IEEE:
     return splitBinaryVectorOp(Op, DAG);
+  case ISD::SMULO:
+  case ISD::UMULO:
+    return lowerXMULO(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
@@ -4989,6 +4995,26 @@
   return Op;
 }
 
+SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc SL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  bool isSigned = Op.getOpcode() == ISD::SMULO;
+
+  SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
+  SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
+                            SL, VT, LHS, RHS);
+
+  SDValue Sign = isSigned
+    ? DAG.getNode(ISD::SRA, SL, VT, Result,
+                  DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i64))
+    : DAG.getConstant(0, SL, VT);
+  SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
+
+  return DAG.getMergeValues({ Result, Overflow }, SL);
+}
+
 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Chain = Op.getOperand(0);
Index: llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
+
+define { i64, i1 } @umulo_i64(i64 %x, i64 %y) {
+; GCN-LABEL: umulo_i64:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_hi_u32 v4, v1, v2
+; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
+; GCN-NEXT: v_mul_hi_u32 v6, v0, v3
+; GCN-NEXT: v_mul_lo_u32 v7, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v8, v0, v2
+; GCN-NEXT: v_mul_hi_u32 v9, v1, v3
+; GCN-NEXT: v_mul_lo_u32 v3, v1, v3
+; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
+; GCN-NEXT: v_add_i32_e32 v1, vcc, v8, v7
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
+; GCN-NEXT: v_add_i32_e32 v6, vcc, v1, v5
+; GCN-NEXT: v_add_i32_e64 v1, s[4:5], v1, v5
+; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y)
+  ret { i64, i1 } %umulo
+}
+
+define { i64, i1 } @smulo_i64(i64 %x, i64 %y) {
+; GCN-LABEL: smulo_i64:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_hi_u32 v6, v1, v2
+; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
+; GCN-NEXT: v_mul_hi_u32 v7, v0, v3
+; GCN-NEXT: v_mul_lo_u32 v8, v0, v3
+; GCN-NEXT: v_mul_hi_u32 v9, v0, v2
+; GCN-NEXT: v_mul_hi_i32 v10, v1, v3
+; GCN-NEXT: v_mul_lo_u32 v11, v1, v3
+; GCN-NEXT: v_mov_b32_e32 v12, 0
+; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
+; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; GCN-NEXT: v_add_i32_e32 v9, vcc, v8, v5
+; GCN-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5
+; GCN-NEXT: v_addc_u32_e32 v8, vcc, v7, v6, vcc
+; GCN-NEXT: v_ashrrev_i32_e32 v6, 31, v5
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GCN-NEXT: v_mov_b32_e32 v7, v6
+; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GCN-NEXT: v_addc_u32_e32 v9, vcc, v12, v9, vcc
+; GCN-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
+; GCN-NEXT: v_subb_u32_e32 v10, vcc, v9, v12, vcc
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
+; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GCN-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
+; GCN-NEXT: v_subb_u32_e32 v8, vcc, v1, v12, vcc
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-NEXT: v_cmp_ne_u64_e32 vcc, v[0:1], v[6:7]
+; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GCN-NEXT: v_mov_b32_e32 v0, v4
+; GCN-NEXT: v_mov_b32_e32 v1, v5
+; GCN-NEXT: s_setpc_b64 s[30:31]
+bb:
+  %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y)
+  ret { i64, i1 } %smulo
+}
+
+declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
+declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64)
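
Note (illustration only, not part of the patch): lowerXMULO computes the low half of the product with ISD::MUL and the high half with ISD::MULHU/ISD::MULHS, then flags overflow whenever the high half is not what a non-overflowing multiply would produce: zero for the unsigned case, and the sign-extension of the low half (arithmetic shift right by BitWidth - 1) for the signed case. A minimal scalar sketch of the same identity, written here for 32-bit operands with a 64-bit widening multiply standing in for the MULH nodes:

// Illustration of the overflow check used by lowerXMULO, on scalars.
// Assumes two's-complement wrap on narrowing and arithmetic right shift
// of negative values (guaranteed from C++20 onwards).
#include <cstdint>
#include <utility>

// Unsigned: overflow iff the high half of the full product is nonzero.
std::pair<uint32_t, bool> umulo32(uint32_t X, uint32_t Y) {
  uint64_t Full = (uint64_t)X * Y;
  uint32_t Result = (uint32_t)Full;       // ISD::MUL
  uint32_t Top = (uint32_t)(Full >> 32);  // ISD::MULHU
  return {Result, Top != 0};
}

// Signed: overflow iff the high half differs from the sign-extension of
// the low half, i.e. from Result >> (BitWidth - 1).
std::pair<int32_t, bool> smulo32(int32_t X, int32_t Y) {
  int64_t Full = (int64_t)X * Y;
  int32_t Result = (int32_t)Full;         // ISD::MUL
  int32_t Top = (int32_t)(Full >> 32);    // ISD::MULHS
  int32_t Sign = Result >> 31;            // ISD::SRA by BitWidth - 1
  return {Result, Top != Sign};
}

The generated GCN in the tests above performs the same comparison with v_cmp_ne_u64: the unsigned test checks the computed high half in v[2:3] against zero, while the signed test checks it (in v[0:1], after the conditional subtractions that correct the unsigned high product for negative operands) against the replicated sign bit in v[6:7].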