diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -14421,6 +14421,49 @@
   return SDValue();
 }
 
+/// Combine an ARMISD::VMOVN node. Operand 0 is Qd, operand 1 is Qm, and the
+/// constant operand 2 selects insertion into the top (nonzero) or bottom
+/// (zero) lanes.
+///
+/// A VMOVN whose Qm is a bottom VQMOVNs/VQMOVNu is folded into one VQMOVN.
+/// Otherwise Op0 and Op1 are offered to SimplifyDemandedVectorElts with only
+/// the lanes this node reads; SDValue(N, 0) is returned if that call returns
+/// true for either operand, and an empty SDValue if nothing applied.
+static SDValue PerformVMOVNCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned IsTop = N->getConstantOperandVal(2);
+
+  // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
+  // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
+  if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
+       Op1->getOpcode() == ARMISD::VQMOVNu) &&
+      Op1->getConstantOperandVal(2) == 0)
+    return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
+                           Op0, Op1->getOperand(1), N->getOperand(2));
+
+  // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
+  // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
+  // into the top or bottom lanes.
+  unsigned NumElts = N->getValueType(0).getVectorNumElements();
+  APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
+  APInt Op0DemandedElts =
+      IsTop ? Op1DemandedElts
+            : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
+
+  APInt KnownUndef, KnownZero;
+  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+  if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+  if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
+                                     KnownZero, DCI))
+    return SDValue(N, 0);
+
+  return SDValue();
+}
+
 static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue Op0 = N->getOperand(0);
@@ -15555,6 +15598,8 @@
     return PerformVCMPCombine(N, DCI, Subtarget);
   case ISD::VECREDUCE_ADD:
     return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::VMOVN:
+    return PerformVMOVNCombine(N, DCI);
   case ARMISD::ASRL:
   case ARMISD::LSRL:
   case ARMISD::LSLL:
diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -1418,13 +1418,11 @@
 ; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
 ; CHECK-NEXT:    vmullt.s16 q2, q1, q0
 ; CHECK-NEXT:    vmullb.s16 q0, q1, q0
-; CHECK-NEXT:    vshr.s32 q2, q2, #15
 ; CHECK-NEXT:    vshr.s32 q0, q0, #15
-; CHECK-NEXT:    vqmovnb.s32 q2, q2
+; CHECK-NEXT:    vshr.s32 q2, q2, #15
 ; CHECK-NEXT:    vqmovnb.s32 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovnt.i32 q0, q2
+; CHECK-NEXT:    vqmovnt.s32 q0, q2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB7_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
@@ -1867,11 +1865,9 @@
 ; CHECK-NEXT:    vmullb.s16 q5, q6, q5
 ; CHECK-NEXT:    vshr.s32 q7, q7, #15
 ; CHECK-NEXT:    vshr.s32 q5, q5, #15
-; CHECK-NEXT:    vqmovnb.s32 q7, q7
 ; CHECK-NEXT:    vqmovnb.s32 q5, q5
-; CHECK-NEXT:    vmovlb.s16 q7, q7
 ; CHECK-NEXT:    vmovlb.s16 q5, q5
-; CHECK-NEXT:    vmovnt.i32 q5, q7
+; CHECK-NEXT:    vqmovnt.s32 q5, q7
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q5, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB10_2
@@ -2664,13 +2660,11 @@
 ; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
 ; CHECK-NEXT:    vmullt.s8 q2, q1, q0
 ; CHECK-NEXT:    vmullb.s8 q0, q1, q0
-; CHECK-NEXT:    vshr.s16 q2, q2, #7
 ; CHECK-NEXT:    vshr.s16 q0, q0, #7
-; CHECK-NEXT:    vqmovnb.s16 q2, q2
+; CHECK-NEXT:    vshr.s16 q2, q2, #7
 ; CHECK-NEXT:    vqmovnb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q2, q2
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovnt.i16 q0, q2
+; CHECK-NEXT:    vqmovnt.s16 q0, q2
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB16_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
@@ -3336,13 +3330,11 @@
 ; CHECK-NEXT:    vldrbt.u8 q4, [r1], #16
 ; CHECK-NEXT:    vmullt.s8 q5, q4, q0
 ; CHECK-NEXT:    vmullb.s8 q0, q4, q0
-; CHECK-NEXT:    vshr.s16 q5, q5, #7
 ; CHECK-NEXT:    vshr.s16 q0, q0, #7
-; CHECK-NEXT:    vqmovnb.s16 q5, q5
+; CHECK-NEXT:    vshr.s16 q5, q5, #7
 ; CHECK-NEXT:    vqmovnb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q5, q5
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovnt.i16 q0, q5
+; CHECK-NEXT:    vqmovnt.s16 q0, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrbt.8 q0, [r2], #16
 ; CHECK-NEXT:    le lr, .LBB19_2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vqmovn-combine.ll
@@ -4,9 +4,7 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_t1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_t1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vqmovnb.s32 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovnt.i32 q1, q0
+; CHECK-NEXT:    vqmovnt.s32 q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -23,7 +21,6 @@
 ; CHECK-LABEL: vqmovni32_sminmax_t2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vqmovnb.s32 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmovnt.i32 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -39,9 +36,7 @@
 define arm_aapcs_vfpcc <8 x i16> @vqmovni32_sminmax_b1(<4 x i32> %s0, <8 x i16> %src1) {
 ; CHECK-LABEL: vqmovni32_sminmax_b1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vqmovnb.s32 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovnb.i32 q1, q0
+; CHECK-NEXT:    vqmovnb.s32 q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -137,9 +132,7 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_t1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_t1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vqmovnb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovnt.i16 q1, q0
+; CHECK-NEXT:    vqmovnt.s16 q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -156,7 +149,6 @@
 ; CHECK-LABEL: vqmovni16_sminmax_t2:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vqmovnb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q0, q0
 ; CHECK-NEXT:    vmovnt.i16 q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -172,9 +164,7 @@
 define arm_aapcs_vfpcc <16 x i8> @vqmovni16_sminmax_b1(<8 x i16> %s0, <16 x i8> %src1) {
 ; CHECK-LABEL: vqmovni16_sminmax_b1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vqmovnb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovnb.i16 q1, q0
+; CHECK-NEXT:    vqmovnb.s16 q1, q0
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry: