Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -703,9 +703,14 @@
 
   for (MVT VT : MVT::vector_valuetypes()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
-    setOperationAction(ISD::MULHS, VT, Expand);
+    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
+      setOperationAction(ISD::MULHS, VT, Custom);
+      setOperationAction(ISD::MULHU, VT, Custom);
+    } else {
+      setOperationAction(ISD::MULHS, VT, Expand);
+      setOperationAction(ISD::MULHU, VT, Expand);
+    }
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
-    setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
     setOperationAction(ISD::BSWAP, VT, Expand);
@@ -2549,6 +2554,45 @@
                      DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
+static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  assert(VT.is128BitVector() && VT.isInteger() &&
+         "unexpected type for custom-lowering ISD::MULH{U,S}");
+
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+
+  SDLoc DL(Op);
+
+  EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+  unsigned Mull2VectorIdx = VT.getVectorNumElements() / 2;
+  SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64);
+  SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64);
+
+  SDValue VMullV0 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx);
+  SDValue VMullV1 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx);
+
+  SDValue VMull2V0 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx);
+  SDValue VMull2V1 =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx);
+
+  unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL
+                                                  : AArch64ISD::UMULL;
+
+  EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext());
+  SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1);
+  SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1);
+
+  Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull);
+  Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2);
+
+  return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2);
+}
+
 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                        SelectionDAG &DAG) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2681,6 +2725,9 @@
     return LowerFSINCOS(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
+  case ISD::MULHS:
+  case ISD::MULHU:
+    return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::VECREDUCE_ADD:
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -3773,6 +3773,25 @@
 defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
                                UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
 
+// Patterns for smull2/umull2.
+multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
+           Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+  def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
+                           (extract_high_v16i8 V128:$Rm))),
+             (INST8B V128:$Rn, V128:$Rm)>;
+  def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
+                           (extract_high_v8i16 V128:$Rm))),
+             (INST4H V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
+                           (extract_high_v4i32 V128:$Rm))),
+             (INST2S V128:$Rn, V128:$Rm)>;
+}
+
+defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
+                              SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
+defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
+                              UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
+
 // Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
 multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
            Instruction INST8B, Instruction INST4H, Instruction INST2S> {
Index: test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <16 x i8> @mul16xi8(<16 x i8> %x) {
+; CHECK-LABEL: mul16xi8:
+; CHECK: smull2 [[SMULL2:(v[0-9]+)]].8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: smull [[SMULL:(v[0-9]+)]].8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 [[UZP2:(v[0-9]+).16b]], [[SMULL]].16b, [[SMULL2]].16b
+; CHECK: sshr [[SSHR:(v[0-9]+.16b)]], [[UZP2]], #1
+; CHECK: usra v0.16b, [[SSHR]], #7
+  %div = sdiv <16 x i8> %x, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+  ret <16 x i8> %div
+}
+
+define <8 x i16> @mul8xi16(<8 x i16> %x) {
+; CHECK-LABEL: mul8xi16:
+; CHECK: smull2 [[SMULL2:(v[0-9]+)]].4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: smull [[SMULL:(v[0-9]+)]].4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 [[UZP2:(v[0-9]+).8h]], [[SMULL]].8h, [[SMULL2]].8h
+; CHECK: usra v0.8h, [[UZP2]], #15
+  %div = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+  ret <8 x i16> %div
+}
+
+define <4 x i32> @mul4xi32(<4 x i32> %x) {
+; CHECK-LABEL: mul4xi32:
+; CHECK: smull2 [[SMULL2:(v[0-9]+)]].2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: smull [[SMULL:(v[0-9]+)]].2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL]].4s, [[SMULL2]].4s
+; CHECK: sshr [[SSHR:(v[0-9]+.4s)]], [[UZP2]], #1
+; CHECK: usra v0.4s, [[SSHR]], #31
+  %div = sdiv <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
+  ret <4 x i32> %div
+}
+
+define <16 x i8> @umul16xi8(<16 x i8> %x) {
+; CHECK-LABEL: umul16xi8:
+; CHECK: umull2 [[UMULL2:(v[0-9]+)]].8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+; CHECK: umull [[UMULL:(v[0-9]+)]].8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: uzp2 [[UZP2:(v[0-9]+).16b]], [[UMULL]].16b, [[UMULL2]].16b
+; CHECK: ushr v0.16b, [[UZP2]], #1
+  %div = udiv <16 x i8> %x, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+  ret <16 x i8> %div
+}
+
+define <8 x i16> @umul8xi16(<8 x i16> %x) {
+; CHECK-LABEL: umul8xi16:
+; CHECK: umull2 [[UMULL2:(v[0-9]+)]].4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: umull [[UMULL:(v[0-9]+)]].4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+; CHECK: uzp2 [[UZP2:(v[0-9]+).8h]], [[UMULL]].8h, [[UMULL2]].8h
+; CHECK: ushr v0.8h, [[UZP2]], #3
+  %div = udiv <8 x i16> %x, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
+  ret <8 x i16> %div
+}
+
+define <4 x i32> @umul4xi32(<4 x i32> %x) {
+; CHECK-LABEL: umul4xi32:
+; CHECK: umull2 [[UMULL2:(v[0-9]+)]].2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: umull [[UMULL:(v[0-9]+)]].2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: uzp2 [[UZP2:(v[0-9]+).4s]], [[UMULL]].4s, [[UMULL2]].4s
+; CHECK: ushr v0.4s, [[UZP2]], #1
+  %div = udiv <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x i32> %div
+}
Index: test/CodeGen/AArch64/neon-idiv.ll
===================================================================
--- test/CodeGen/AArch64/neon-idiv.ll
+++ test/CodeGen/AArch64/neon-idiv.ll
@@ -3,11 +3,13 @@
 
 define <4 x i32> @test1(<4 x i32> %a) {
   %rem = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
   ret <4 x i32> %rem
-; CHECK-LABEL: test1
-; FIXME: Can we lower this more efficiently?
-; CHECK: mul
-; CHECK: mul
-; CHECK: mul
-; CHECK: mul
+; For a constant divisor C, X % C is simplified to X - (X / C) * C, and the
+; X / C division is in turn lowered to MULHS by multiplying with a magic
+; number (TargetLowering::BuildSDIV).
+; CHECK-LABEL: test1:
+; CHECK: smull2 [[SMULL2:(v[0-9]+)]].2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK: smull [[SMULL:(v[0-9]+)]].2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL]].4s, [[SMULL2]].4s
+; CHECK: add [[ADD:(v[0-9]+.4s)]], [[UZP2]], v0.4s
+; CHECK: sshr [[SSHR:(v[0-9]+.4s)]], [[ADD]], #2
 }
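
A note on the lowering strategy, separate from the patch itself. The smull/smull2 + uzp2 sequence works because the high half of each widened product is exactly the MULH result for that lane, and uzp2 applied to the two double-width results (reinterpreted in the original element type) selects precisely those high halves. Below is a minimal standalone C++ model of this for a v4i32 ISD::MULHS. It is an illustration only, not LLVM code: the helper name mulhs_v4i32 and the test values are invented here, and a little-endian host is assumed so that the odd-indexed 32-bit lanes of a 64-bit value are its high halves.

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  // Model of LowerMULH for v4i32: widen each pair of lanes to 64 bits
  // (smull covers lanes 0-1, smull2 covers lanes 2-3), then reinterpret
  // the two v2i64 results as v4i32 each and keep the odd-indexed lanes,
  // which is what uzp2 does with little-endian lane ordering.
  void mulhs_v4i32(const int32_t a[4], const int32_t b[4], int32_t out[4]) {
    int64_t wide[4];
    for (int i = 0; i < 4; ++i)
      wide[i] = (int64_t)a[i] * b[i];

    int32_t lanes[8];
    std::memcpy(lanes, wide, sizeof wide); // bitcast v2i64 x2 -> v4i32 x2
    for (int i = 0; i < 4; ++i)
      out[i] = lanes[2 * i + 1];           // uzp2: odd lanes = high halves
  }

  int main() {
    const int32_t a[4] = {1 << 30, -5, 123456789, -1};
    const int32_t b[4] = {3, 7, 987654321, -1};
    int32_t got[4];
    mulhs_v4i32(a, b, got);
    for (int i = 0; i < 4; ++i)
      assert(got[i] == (int32_t)(((int64_t)a[i] * b[i]) >> 32));
    return 0;
  }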
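
Likewise, the comment added to neon-idiv.ll refers to the magic-number expansion performed by TargetLowering::BuildSDIV. As a sketch of the scalar arithmetic behind the sequence the test checks for the divisor 7: 0x92492493 is the standard 32-bit signed magic constant for 7; because it is negative, the expansion adds the dividend back after the multiply-high (the checked add), applies the post-shift of 2 (the checked sshr #2), and finally adds the sign bit so the quotient rounds toward zero. The helper below is plain C++ for illustration, not the code LLVM emits:

  #include <cassert>
  #include <cstdint>

  // Signed division by the constant 7 without a divide instruction.
  int32_t sdiv7(int32_t x) {
    const int64_t Magic = -1840700269; // 0x92492493 as a signed 32-bit value
    int32_t hi = (int32_t)(((int64_t)x * Magic) >> 32); // MULHS by the magic
    hi += x;  // fixup add, needed because the magic constant is negative
    hi >>= 2; // post-shift for divisor 7
    hi += (int32_t)((uint32_t)hi >> 31); // add the sign bit: round toward zero
    return hi;
  }

  int main() {
    for (int32_t x : {-2147483647, -100, -8, -7, -1, 0, 1, 6, 7, 100,
                      2147483647})
      assert(sdiv7(x) == x / 7);
    return 0;
  }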