Index: llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- llvm/lib/Target/ARM/ARMISelLowering.cpp +++ llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12588,8 +12588,14 @@ CondCode1 = (ARMCC::CondCodes)cast(N1->getOperand(1)) ->getZExtValue(); - if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) - return SDValue(); + // Convert or(A, B) into not(and(not(A), not(B))), as 'not's are essentially + // free compared to ORs + if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL) { + SDValue NewN0 = DCI.DAG.getLogicalNOT({N0}, N0, VT); + SDValue NewN1 = DCI.DAG.getLogicalNOT({N1}, N1, VT); + SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1); + return DCI.DAG.getLogicalNOT({N}, And, VT); + } unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0); unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1); Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -296,9 +296,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) { ; CHECK-LABEL: or_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr.w r12, [sp, #20] +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: ldr.w r12, [sp, #16] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB3_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -315,21 +314,16 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill -; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vpnot ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: vcmp.i32 eq, q1, zr -; CHECK-NEXT: vmrs r5, p0 -; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload -; CHECK-NEXT: vmrs r6, p0 -; CHECK-NEXT: orrs r5, r6 -; CHECK-NEXT: vmsr p0, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 -; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 +; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vpstee +; CHECK-NEXT: vcmpt.i32 ne, q1, zr +; CHECK-NEXT: vldrwe.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwe.u32 q2, [r2], #16 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: le lr, .LBB3_2 @@ -337,12 +331,10 @@ ; CHECK-NEXT: vctp.32 r4 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %cmp8 = icmp eq i32 %N, 0 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph Index: llvm/test/CodeGen/Thumb2/mve-pred-or.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-or.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-or.ll @@ -94,8 +94,8 @@ define arm_aapcs_vfpcc <4 x i32> @cmpultz_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpultz_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -109,7 +109,7 @@ ; CHECK-LABEL: cmpugtz_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vcmpt.i32 eq, q1, zr +; CHECK-NEXT: vcmpt.u32 cs, q1, zr ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -123,13 +123,9 @@ define arm_aapcs_vfpcc <4 x i32> @cmpulez_v4i1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: cmpulez_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.u32 cs, q1, zr -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vcmpt.i32 ne, q1, zr +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -187,7 +183,7 @@ ; CHECK-LABEL: cmpslt_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vcmpt.s32 le, q2, q1 +; CHECK-NEXT: vcmpt.s32 ge, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -202,7 +198,7 @@ ; CHECK-LABEL: cmpsgt_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vcmpt.s32 le, q1, q2 +; CHECK-NEXT: vcmpt.s32 ge, q2, q1 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -217,7 +213,7 @@ ; CHECK-LABEL: cmpsle_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vcmpt.s32 lt, q2, q1 +; CHECK-NEXT: vcmpt.s32 gt, q1, q2 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -232,7 +228,7 @@ ; CHECK-LABEL: cmpsge_v4i1: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vpt.i32 ne, q0, zr -; CHECK-NEXT: vcmpt.s32 lt, q1, q2 +; CHECK-NEXT: vcmpt.s32 gt, q2, q1 ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: @@ -246,13 +242,9 @@ define arm_aapcs_vfpcc <4 x i32> @cmpult_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpult_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.u32 hi, q2, q1 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vcmpt.u32 cs, q1, q2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -265,13 +257,9 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpugt_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.u32 hi, q1, q2 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vcmpt.u32 cs, q2, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -284,13 +272,9 @@ define arm_aapcs_vfpcc <4 x i32> @cmpule_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpule_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.u32 cs, q2, q1 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vcmpt.u32 hi, q1, q2 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer @@ -303,13 +287,9 @@ define arm_aapcs_vfpcc <4 x i32> @cmpuge_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: cmpuge_v4i1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcmp.u32 cs, q1, q2 -; CHECK-NEXT: vmrs r0, p0 -; CHECK-NEXT: vcmp.i32 eq, q0, zr -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: orrs r0, r1 -; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vcmpt.u32 hi, q2, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: bx lr entry: %c1 = icmp eq <4 x i32> %a, zeroinitializer Index: llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -190,8 +190,8 @@ ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vpte.f32 le, q0, q3 -; CHECK-NEXT: vcmpt.f32 le, q3, q1 +; CHECK-NEXT: vpte.f32 ge, q3, q0 +; CHECK-NEXT: vcmpt.f32 ge, q1, q3 ; CHECK-NEXT: vstrwe.32 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -250,8 +250,8 @@ ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q3, [r0] -; CHECK-NEXT: vpte.f16 le, q1, q3 -; CHECK-NEXT: vcmpt.f16 le, q3, q0 +; CHECK-NEXT: vpte.f16 ge, q3, q1 +; CHECK-NEXT: vcmpt.f16 ge, q0, q3 ; CHECK-NEXT: vstrhe.16 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -482,8 +482,8 @@ ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vpte.f32 le, q0, q3 -; CHECK-NEXT: vcmpt.f32 le, q3, q1 +; CHECK-NEXT: vpte.f32 ge, q3, q0 +; CHECK-NEXT: vcmpt.f32 ge, q1, q3 ; CHECK-NEXT: vstrwe.32 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup @@ -542,8 +542,8 @@ ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrh.u16 q3, [r0] -; CHECK-NEXT: vpte.f16 le, q1, q3 -; CHECK-NEXT: vcmpt.f16 le, q3, q0 +; CHECK-NEXT: vpte.f16 ge, q3, q1 +; CHECK-NEXT: vcmpt.f16 ge, q0, q3 ; CHECK-NEXT: vstrhe.16 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup