Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12690,46 +12690,53 @@ SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - ARMCC::CondCodes CondCode0 = ARMCC::AL; - ARMCC::CondCodes CondCode1 = ARMCC::AL; + auto getOppositeCondition = [](SDValue Value, unsigned Idx) { + const ConstantSDNode *Const = + cast<ConstantSDNode>(Value->getOperand(Idx)); + ARMCC::CondCodes Result = + ARMCC::getOppositeCondition((ARMCC::CondCodes)Const->getZExtValue()); + if (isValidMVECond(Result, + Value->getOperand(0)->getValueType(0).isFloatingPoint())) + return Result; + return ARMCC::AL; + }; + + ARMCC::CondCodes Opposite0 = ARMCC::AL; if (N0->getOpcode() == ARMISD::VCMP) - CondCode0 = (ARMCC::CondCodes)cast<ConstantSDNode>(N0->getOperand(2)) - ->getZExtValue(); + Opposite0 = getOppositeCondition(N0, 2); else if (N0->getOpcode() == ARMISD::VCMPZ) - CondCode0 = (ARMCC::CondCodes)cast<ConstantSDNode>(N0->getOperand(1)) - ->getZExtValue(); + Opposite0 = getOppositeCondition(N0, 1); + + ARMCC::CondCodes Opposite1 = ARMCC::AL; if (N1->getOpcode() == ARMISD::VCMP) - CondCode1 = (ARMCC::CondCodes)cast<ConstantSDNode>(N1->getOperand(2)) - ->getZExtValue(); + Opposite1 = getOppositeCondition(N1, 2); else if (N1->getOpcode() == ARMISD::VCMPZ) - CondCode1 = (ARMCC::CondCodes)cast<ConstantSDNode>(N1->getOperand(1)) - ->getZExtValue(); + Opposite1 = getOppositeCondition(N1, 1); - if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) + if (Opposite0 == ARMCC::AL && Opposite1 == ARMCC::AL) return SDValue(); - unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); - unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); - - if (!isValidMVECond(Opposite0, - N0->getOperand(0)->getValueType(0).isFloatingPoint()) || - !isValidMVECond(Opposite1, - N1->getOperand(0)->getValueType(0).isFloatingPoint())) - return SDValue(); + SDValue NewN0, NewN1; + if (Opposite0 != ARMCC::AL) { + SmallVector<SDValue, 4> Ops; + 
Ops.push_back(N0->getOperand(0)); + if (N0->getOpcode() == ARMISD::VCMP) + Ops.push_back(N0->getOperand(1)); + Ops.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); + NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops); + } else + NewN0 = DCI.DAG.getLogicalNOT({N0}, N0, VT); - SmallVector<SDValue, 4> Ops0; - Ops0.push_back(N0->getOperand(0)); - if (N0->getOpcode() == ARMISD::VCMP) - Ops0.push_back(N0->getOperand(1)); - Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32)); - SmallVector<SDValue, 4> Ops1; - Ops1.push_back(N1->getOperand(0)); - if (N1->getOpcode() == ARMISD::VCMP) - Ops1.push_back(N1->getOperand(1)); - Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + if (Opposite1 != ARMCC::AL) { + SmallVector<SDValue, 4> Ops; + Ops.push_back(N1->getOperand(0)); + if (N1->getOpcode() == ARMISD::VCMP) + Ops.push_back(N1->getOperand(1)); + Ops.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32)); + NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops); + } else + NewN1 = DCI.DAG.getLogicalNOT({N1}, N1, VT); - SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0); - SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1); SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And, DCI.DAG.getAllOnesConstant(SDLoc(N), VT)); Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -296,9 +296,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { ; CHECK-LABEL: or_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, lr} -; 
CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr.w r12, [sp, #20] +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: ldr.w r12, [sp, #16] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -315,21 +314,16 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vpnot ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: vcmp.i32 eq, q1, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: orrs r5, r6 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstee +; CHECK-NEXT: vcmpt.i32 ne, q1, zr +; CHECK-NEXT: vldrwe.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwe.u32 q2, [r2], #16 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 @@ -337,12 +331,10 @@ ; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph Index: llvm/test/CodeGen/Thumb2/mve-pred-or.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-or.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-or.ll @@ -124,12 +124,10 @@ ; CHECK-LABEL: cmpulez_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcmp.u32 cs, q1, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: 
orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -247,12 +245,10 @@ ; CHECK-LABEL: cmpult_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcmp.u32 hi, q2, q1 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -266,12 +262,10 @@ ; CHECK-LABEL: cmpugt_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcmp.u32 hi, q1, q2 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -285,12 +279,10 @@ ; CHECK-LABEL: cmpule_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcmp.u32 cs, q2, q1 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -304,12 +296,10 @@ ; CHECK-LABEL: cmpuge_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vcmp.u32 cs, q1, q2 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: 
vcmpt.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer