diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -863,6 +863,7 @@
                               unsigned NewOp) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3536,9 +3536,8 @@
   case ISD::INSERT_SUBVECTOR:
     return LowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::SDIV:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
   case ISD::UDIV:
-    return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+    return LowerDIV(Op, DAG);
   case ISD::SMIN:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
   case ISD::UMIN:
@@ -8791,6 +8790,35 @@
   return SDValue();
 }
 
+SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  bool Signed = Op.getOpcode() == ISD::SDIV;
+  unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
+  if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
+    return LowerToPredicatedOp(Op, DAG, PredOpcode);
+
+  // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
+  // operations, and truncate the result.
+  EVT WidenedVT;
+  if (VT == MVT::nxv16i8)
+    WidenedVT = MVT::nxv8i16;
+  else if (VT == MVT::nxv8i16)
+    WidenedVT = MVT::nxv4i32;
+  else
+    llvm_unreachable("Unexpected Custom DIV operation");
+
+  SDLoc dl(Op);
+  unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
+  unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
+  SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
+  SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
+  SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
+  SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
+  SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
+  SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
+  return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
+}
+
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   // Currently no fixed length shuffles that require SVE are legal.
   if (useSVEForFixedLengthVectorVT(VT))
diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
--- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
+++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll
@@ -5,6 +5,50 @@
 ; SDIV
 ;
 
+define <vscale x 16 x i8> @sdiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sdiv_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpkhi z2.h, z1.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpklo z1.h, z1.b
+; CHECK-NEXT:    sunpklo z0.h, z0.b
+; CHECK-NEXT:    sunpkhi z4.s, z2.h
+; CHECK-NEXT:    sunpkhi z5.s, z3.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    sunpkhi z5.s, z1.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %div = sdiv <vscale x 16 x i8> %a, %b
+  ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @sdiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sdiv_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %div = sdiv <vscale x 8 x i16> %a, %b
+  ret <vscale x 8 x i16> %div
+}
+
 define <vscale x 4 x i32> @sdiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: sdiv_i32:
 ; CHECK:       // %bb.0:
@@ -63,6 +107,57 @@
 ; SREM
 ;
 
+define <vscale x 16 x i8> @srem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: srem_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpkhi z2.h, z1.b
+; CHECK-NEXT:    sunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpklo z4.h, z1.b
+; CHECK-NEXT:    sunpklo z5.h, z0.b
+; CHECK-NEXT:    sunpkhi z6.s, z2.h
+; CHECK-NEXT:    sunpkhi z7.s, z3.h
+; CHECK-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEXT:    sdivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    sunpkhi z7.s, z4.h
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sunpkhi z3.s, z5.h
+; CHECK-NEXT:    sunpklo z4.s, z4.h
+; CHECK-NEXT:    sunpklo z5.s, z5.h
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z7.s
+; CHECK-NEXT:    sdivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    sub z0.b, z0.b, z2.b
+; CHECK-NEXT:    ret
+  %div = srem <vscale x 16 x i8> %a, %b
+  ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @srem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: srem_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEXT:    sunpkhi z3.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sdivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sunpklo z5.s, z0.h
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    sdiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    sub z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %div = srem <vscale x 8 x i16> %a, %b
+  ret <vscale x 8 x i16> %div
+}
+
 define <vscale x 4 x i32> @srem_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: srem_i32:
 ; CHECK:       // %bb.0:
@@ -93,6 +188,50 @@
 ; UDIV
 ;
 
+define <vscale x 16 x i8> @udiv_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: udiv_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z2.h, z1.b
+; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z1.h, z1.b
+; CHECK-NEXT:    uunpklo z0.h, z0.b
+; CHECK-NEXT:    uunpkhi z4.s, z2.h
+; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uunpkhi z5.s, z1.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z5.s
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z1.h, z2.h, z4.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z3.h
+; CHECK-NEXT:    uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %div = udiv <vscale x 16 x i8> %a, %b
+  ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @udiv_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: udiv_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    udiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %div = udiv <vscale x 8 x i16> %a, %b
+  ret <vscale x 8 x i16> %div
+}
+
 define <vscale x 4 x i32> @udiv_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: udiv_i32:
 ; CHECK:       // %bb.0:
@@ -152,6 +291,57 @@
 ; UREM
 ;
 
+define <vscale x 16 x i8> @urem_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: urem_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z2.h, z1.b
+; CHECK-NEXT:    uunpkhi z3.h, z0.b
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpklo z4.h, z1.b
+; CHECK-NEXT:    uunpklo z5.h, z0.b
+; CHECK-NEXT:    uunpkhi z6.s, z2.h
+; CHECK-NEXT:    uunpkhi z7.s, z3.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    udivr z6.s, p0/m, z6.s, z7.s
+; CHECK-NEXT:    uunpkhi z7.s, z4.h
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uunpkhi z3.s, z5.h
+; CHECK-NEXT:    uunpklo z4.s, z4.h
+; CHECK-NEXT:    uunpklo z5.s, z5.h
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z7.s
+; CHECK-NEXT:    udivr z4.s, p0/m, z4.s, z5.s
+; CHECK-NEXT:    uzp1 z2.h, z2.h, z6.h
+; CHECK-NEXT:    uzp1 z3.h, z4.h, z3.h
+; CHECK-NEXT:    uzp1 z2.b, z3.b, z2.b
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    mul z2.b, p0/m, z2.b, z1.b
+; CHECK-NEXT:    sub z0.b, z0.b, z2.b
+; CHECK-NEXT:    ret
+  %div = urem <vscale x 16 x i8> %a, %b
+  ret <vscale x 16 x i8> %div
+}
+
+define <vscale x 8 x i16> @urem_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: urem_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEXT:    uunpkhi z3.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    udivr z2.s, p0/m, z2.s, z3.s
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    uunpklo z5.s, z0.h
+; CHECK-NEXT:    movprfx z3, z5
+; CHECK-NEXT:    udiv z3.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uzp1 z2.h, z3.h, z2.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT:    sub z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %div = urem <vscale x 8 x i16> %a, %b
+  ret <vscale x 8 x i16> %div
+}
+
 define <vscale x 4 x i32> @urem_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: urem_i32:
 ; CHECK:       // %bb.0:
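For reference, below is a standalone C++ sketch (not part of the patch; the fixed lane count and the NarrowSDiv helper are purely illustrative) of the lane arithmetic LowerDIV sets up for the nxv8i16 case: both operands are sign-unpacked into 32-bit lo/hi halves, divided at 32 bits, and the quotients are narrowed and concatenated lo-then-hi, which is what the final UZP1 produces. It only models the per-lane semantics on plain arrays; the real lowering operates on scalable vectors via SUNPKLO/SUNPKHI/UZP1 nodes.

// Sketch of the widen-divide-narrow decomposition used for i16 SDIV (assumed
// 8 lanes standing in for one unpredicated SVE register; names illustrative).
#include <array>
#include <cassert>
#include <cstdint>

constexpr int Lanes = 8; // illustrative lane count for one nxv8i16 register

std::array<int16_t, Lanes> NarrowSDiv(const std::array<int16_t, Lanes> &A,
                                      const std::array<int16_t, Lanes> &B) {
  std::array<int32_t, Lanes / 2> LoQ, HiQ;
  for (int I = 0; I < Lanes / 2; ++I) {
    // SUNPKLO both operands, then a 32-bit SDIV on the low halves.
    LoQ[I] = int32_t(A[I]) / int32_t(B[I]);
    // SUNPKHI both operands, then a 32-bit SDIV on the high halves.
    HiQ[I] = int32_t(A[Lanes / 2 + I]) / int32_t(B[Lanes / 2 + I]);
  }
  // UZP1: keep the low 16 bits of each quotient, lo half followed by hi half.
  std::array<int16_t, Lanes> R;
  for (int I = 0; I < Lanes / 2; ++I) {
    R[I] = int16_t(LoQ[I]);
    R[Lanes / 2 + I] = int16_t(HiQ[I]);
  }
  return R;
}

int main() {
  std::array<int16_t, Lanes> A, B;
  for (int I = 0; I < Lanes; ++I) {
    A[I] = int16_t(1000 * I - 3500);
    B[I] = int16_t(I % 2 ? 7 : -13); // non-zero divisors only
  }
  std::array<int16_t, Lanes> R = NarrowSDiv(A, B);
  for (int I = 0; I < Lanes; ++I)
    assert(R[I] == int16_t(A[I] / B[I])); // matches plain element-wise sdiv
  return 0;
}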