Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2825,7 +2825,10 @@ // And if the target does not like this form then turn into: // sub y, (xor x, -1) if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD && - N0.hasOneUse()) { + N0.hasOneUse() && + // Limit this to after legalization if the add has wrap flags + (Level >= AfterLegalizeDAG || (!N->getFlags().hasNoUnsignedWrap() && + !N->getFlags().hasNoSignedWrap()))) { SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not); @@ -3041,7 +3044,10 @@ // And if the target does not like this form then turn into: // sub y, (xor x, -1) if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD && - N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1))) { + N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1)) && + // Limit this to after legalization if the add has wrap flags + (Level >= AfterLegalizeDAG || (!N0->getFlags().hasNoUnsignedWrap() && + !N0->getFlags().hasNoSignedWrap()))) { SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0), DAG.getAllOnesConstant(DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N1, Not); Index: llvm/test/CodeGen/AArch64/arm64-vhadd.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -968,10 +968,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: shl.4h v0, v0, #8 ; CHECK-NEXT: shl.4h v1, v1, #8 +; CHECK-NEXT: movi.4h v2, #1 ; CHECK-NEXT: sshr.4h v0, v0, #8 -; CHECK-NEXT: sshr.4h v1, v1, #8 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.4h v0, v1, v0 +; CHECK-NEXT: ssra.4h v0, v1, #8 +; CHECK-NEXT: add.4h v0, v0, v2 ; CHECK-NEXT: ushr.4h v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i8> %src1 
to <4 x i16> @@ -1004,9 +1004,7 @@ ; CHECK-NEXT: shl.2s v1, v1, #24 ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: sshr.2s v1, v1, #24 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: sshr.2s v0, v0, #1 +; CHECK-NEXT: srhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> %zextsrc2 = sext <2 x i8> %src2 to <2 x i16> @@ -1022,9 +1020,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: urhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -1039,12 +1035,12 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: shl.2s v0, v0, #24 ; CHECK-NEXT: shl.2s v1, v1, #24 -; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: movi.2s v2, #1 ; CHECK-NEXT: sshr.2s v0, v0, #24 -; CHECK-NEXT: sshr.2s v1, v1, #24 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: and.8b v0, v0, v2 +; CHECK-NEXT: ssra.2s v0, v1, #24 +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: add.2s v0, v0, v2 +; CHECK-NEXT: and.8b v0, v0, v1 ; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> @@ -1061,9 +1057,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: urhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> Index: llvm/test/CodeGen/AArch64/sve-hadd.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-hadd.ll +++ llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -916,15 +916,23 @@ } define <vscale x 2 x i16> @rhaddu_v2i16(<vscale x 2 x i16> %s0, <vscale x 2 x i16> %s1) { -; CHECK-LABEL: rhaddu_v2i16: -; CHECK: // %bb.0: 
// %entry -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z0.d, #0xffff -; CHECK-NEXT: and z1.d, z1.d, #0xffff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhaddu_v2i16: +; SVE: // %bb.0: // %entry +; SVE-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; SVE-NEXT: and z0.d, z0.d, #0xffff +; SVE-NEXT: and z1.d, z1.d, #0xffff +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.d, z1.d, z0.d +; SVE-NEXT: lsr z0.d, z0.d, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhaddu_v2i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: and z0.d, z0.d, #0xffff +; SVE2-NEXT: and z1.d, z1.d, #0xffff +; SVE2-NEXT: urhadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret entry: %s0s = zext <vscale x 2 x i16> %s0 to <vscale x 2 x i32> %s1s = zext <vscale x 2 x i16> %s1 to <vscale x 2 x i32> @@ -1127,15 +1135,23 @@ } define <vscale x 4 x i8> @rhaddu_v4i8(<vscale x 4 x i8> %s0, <vscale x 4 x i8> %s1) { -; CHECK-LABEL: rhaddu_v4i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhaddu_v4i8: +; SVE: // %bb.0: // %entry +; SVE-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; SVE-NEXT: and z0.s, z0.s, #0xff +; SVE-NEXT: and z1.s, z1.s, #0xff +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.s, z1.s, z0.s +; SVE-NEXT: lsr z0.s, z0.s, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhaddu_v4i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: and z0.s, z0.s, #0xff +; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: urhadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ret entry: %s0s = zext <vscale x 4 x i8> %s0 to <vscale x 4 x i16> %s1s = zext <vscale x 4 x i8> %s1 to <vscale x 4 x i16>