Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13992,9 +13992,62 @@
   return SDValue();
 }
 
+// When lowering complex nodes that we recognize, like VQDMULH and MULH, we
+// can end up with shuffle(binop(shuffle, shuffle)), that can be simplified to
+// binop as the shuffles cancel out.
+//
+// N is the outer shuffle. The fold applies only when N and both inner shuffles
+// are single-source (second operand undef), the inner shuffles share one mask,
+// and composing that mask with N's mask leaves every defined lane in place.
+// Returns the binop applied directly to the un-shuffled inputs, or an empty
+// SDValue if the pattern does not match.
+static SDValue FlattenVectorShuffle(ShuffleVectorSDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (!N->getOperand(1).isUndef() || N->getOperand(0).getValueType() != VT)
+    return SDValue();
+  SDValue Op = N->getOperand(0);
+
+  // Looking for binary operators that will have been folded from
+  // truncates/extends.
+  switch (Op.getOpcode()) {
+  case ARMISD::VQDMULH:
+  case ISD::MULHS:
+  case ISD::MULHU:
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Both binop inputs must be single-source shuffles using the same mask, so
+  // that undoing one permutation undoes it for both operands.
+  ShuffleVectorSDNode *Op0 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0));
+  ShuffleVectorSDNode *Op1 = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1));
+  if (!Op0 || !Op1 || !Op0->getOperand(1).isUndef() ||
+      !Op1->getOperand(1).isUndef() || Op0->getMask() != Op1->getMask() ||
+      Op0->getOperand(0).getValueType() != VT)
+    return SDValue();
+
+  // Check the mask turns into an identity shuffle: output lane i reads binop
+  // lane NMask[i], which was computed from input lane OpMask[NMask[i]]. A mask
+  // entry of -1 is an undef lane and matches anything; index 0 is a real lane
+  // and must be checked like any other.
+  ArrayRef<int> NMask = N->getMask();
+  ArrayRef<int> OpMask = Op0->getMask();
+  for (int i = 0, e = NMask.size(); i != e; i++) {
+    if (NMask[i] >= 0 && OpMask[NMask[i]] >= 0 && OpMask[NMask[i]] != i)
+      return SDValue();
+  }
+
+  return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+                     Op0->getOperand(0), Op1->getOperand(0));
+}
+
 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
 /// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { + if (SDValue R = FlattenVectorShuffle(cast(N), DAG)) + return R; + // The LLVM shufflevector instruction does not require the shuffle mask // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the Index: llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll +++ llvm/test/CodeGen/Thumb2/mve-vqdmulh.ll @@ -140,55 +140,7 @@ define arm_aapcs_vfpcc <8 x i16> @vqdmulh_i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) { ; CHECK-LABEL: vqdmulh_i16_interleaved: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.u16 r0, q0[0] -; CHECK-NEXT: vmov.16 q2[0], r0 -; CHECK-NEXT: vmov.u16 r0, q0[2] -; CHECK-NEXT: vmov.16 q2[1], r0 -; CHECK-NEXT: vmov.u16 r0, q0[4] -; CHECK-NEXT: vmov.16 q2[2], r0 -; CHECK-NEXT: vmov.u16 r0, q0[6] -; CHECK-NEXT: vmov.16 q2[3], r0 -; CHECK-NEXT: vmov.u16 r0, q0[1] -; CHECK-NEXT: vmov.16 q2[4], r0 -; CHECK-NEXT: vmov.u16 r0, q0[3] -; CHECK-NEXT: vmov.16 q2[5], r0 -; CHECK-NEXT: vmov.u16 r0, q0[5] -; CHECK-NEXT: vmov.16 q2[6], r0 -; CHECK-NEXT: vmov.u16 r0, q0[7] -; CHECK-NEXT: vmov.16 q2[7], r0 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[1], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vqdmulh.s16 q1, q0, q2 -; CHECK-NEXT: vmov.u16 r0, q1[0] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.u16 r0, q1[4] -; CHECK-NEXT: vmov.16 q0[1], r0 -; 
CHECK-NEXT: vmov.u16 r0, q1[1] -; CHECK-NEXT: vmov.16 q0[2], r0 -; CHECK-NEXT: vmov.u16 r0, q1[5] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.u16 r0, q1[2] -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.u16 r0, q1[6] -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov.u16 r0, q1[3] -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov.u16 r0, q1[7] -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vqdmulh.s16 q0, q1, q0 ; CHECK-NEXT: bx lr entry: %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32>