diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -944,33 +944,37 @@
   SDValue ExtOpA = Add.getOperand(0);
   SDValue ExtOpB = Add.getOperand(1);
-  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3) {
+  SDValue Add2;
+  auto MatchOperands = [&](SDValue Op1, SDValue Op2, SDValue Op3, SDValue A) {
     ConstantSDNode *ConstOp;
     if ((ConstOp = isConstOrConstSplat(Op1, DemandedElts)) && ConstOp->isOne()) {
       ExtOpA = Op2;
       ExtOpB = Op3;
+      Add2 = A;
       return true;
     }
     if ((ConstOp = isConstOrConstSplat(Op2, DemandedElts)) && ConstOp->isOne()) {
       ExtOpA = Op1;
       ExtOpB = Op3;
+      Add2 = A;
       return true;
     }
     if ((ConstOp = isConstOrConstSplat(Op3, DemandedElts)) && ConstOp->isOne()) {
       ExtOpA = Op1;
       ExtOpB = Op2;
+      Add2 = A;
       return true;
     }
     return false;
   };

   bool IsCeil = (ExtOpA.getOpcode() == ISD::ADD &&
-                 MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB)) ||
+                 MatchOperands(ExtOpA.getOperand(0), ExtOpA.getOperand(1), ExtOpB, ExtOpA)) ||
                 (ExtOpB.getOpcode() == ISD::ADD &&
-                 MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA));
+                 MatchOperands(ExtOpB.getOperand(0), ExtOpB.getOperand(1), ExtOpA, ExtOpB));

   // If the shift is signed (sra):
   //  - Needs >= 2 sign bit for both operands.
@@ -1033,8 +1037,18 @@
   EVT NVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_ceil(MinWidth));
   if (VT.isVector())
     NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
-  if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT))
-    return SDValue();
+  if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) {
+    // If we could not transform, and (both) adds are nuw/nsw, we can use the
+    // larger type size to do the transform.
+    if (((!IsSigned && Add->getFlags().hasNoUnsignedWrap() &&
+          (!Add2 || Add2->getFlags().hasNoUnsignedWrap())) ||
+         (IsSigned && Add->getFlags().hasNoSignedWrap() &&
+          (!Add2 || Add2->getFlags().hasNoSignedWrap()))) &&
+        TLI.isOperationLegalOrCustom(AVGOpc, VT)) {
+      NVT = VT;
+    } else
+      return SDValue();
+  }

   SDLoc DL(Op);
   SDValue ResultAVG =
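Reviewer note: combineShiftToAVG previously bailed out whenever the ideal narrow type NVT had no legal or custom avg operation. With this change it can fall back to the original, wider type VT, provided the extending add is wrap-free (nuw for the unsigned srl forms, nsw for the signed sra forms) and the avg node is legal at VT; the new Add2 value tracks the second "+1" add of the rounding patterns so that its flags are checked as well. A minimal sketch of the floor-variant IR this unlocks (hypothetical function name, fixed-width vectors for brevity; the in-tree tests below use SVE scalable types):

; Unsigned halving add of i32 values computed in i64: the ideal avg type
; <2 x i32> may not be legal, but nuw proves the i64 add cannot wrap, so
; (a + b) >> 1 is exactly avgflooru and may be formed at the wider type.
define <2 x i32> @haddu_sketch(<2 x i32> %a, <2 x i32> %b) {
  %ae  = zext <2 x i32> %a to <2 x i64>
  %be  = zext <2 x i32> %b to <2 x i64>
  %sum = add nuw <2 x i64> %ae, %be
  %avg = lshr <2 x i64> %sum, <i64 1, i64 1>
  %res = trunc <2 x i64> %avg to <2 x i32>
  ret <2 x i32> %res
}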
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -813,8 +813,8 @@
 ; CHECK-NEXT:    shl.4h v0, v0, #8
 ; CHECK-NEXT:    shl.4h v1, v1, #8
 ; CHECK-NEXT:    sshr.4h v0, v0, #8
-; CHECK-NEXT:    ssra.4h v0, v1, #8
-; CHECK-NEXT:    sshr.4h v0, v0, #1
+; CHECK-NEXT:    sshr.4h v1, v1, #8
+; CHECK-NEXT:    shadd.4h v0, v0, v1
 ; CHECK-NEXT:    ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
@@ -828,8 +828,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bic.4h v0, #255, lsl #8
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    add.4h v0, v0, v1
-; CHECK-NEXT:    ushr.4h v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
@@ -859,8 +858,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bic.4h v0, #255, lsl #8
 ; CHECK-NEXT:    bic.4h v1, #255, lsl #8
-; CHECK-NEXT:    add.4h v0, v0, v1
-; CHECK-NEXT:    ushr.4h v0, v0, #1
+; CHECK-NEXT:    uhadd.4h v0, v0, v1
 ; CHECK-NEXT:    ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
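Reviewer note: the SVE2 tests below change shape more than the NEON ones above because UHADD/URHADD are predicated instructions, so an all-true ptrue governing predicate is materialized in place of the old add/lsr (and eor/sub/lsr for the rounding form) sequences.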
diff --git a/llvm/test/CodeGen/AArch64/sve2-hadd.ll b/llvm/test/CodeGen/AArch64/sve2-hadd.ll
--- a/llvm/test/CodeGen/AArch64/sve2-hadd.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-hadd.ll
@@ -51,9 +51,10 @@
 define <vscale x 2 x i32> @haddu_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
 ; CHECK-LABEL: haddu_v2i32:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT:    adr z0.d, [z0.d, z1.d, uxtw]
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
+; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
+; CHECK-NEXT:    uhadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
@@ -116,10 +117,10 @@
 define <vscale x 2 x i16> @haddu_v2i16(<vscale x 2 x i16> %s0, <vscale x 2 x i16> %s1) {
 ; CHECK-LABEL: haddu_v2i16:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    and z1.d, z1.d, #0xffff
-; CHECK-NEXT:    add z0.d, z0.d, z1.d
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
+; CHECK-NEXT:    uhadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 2 x i16> %s0 to <vscale x 2 x i32>
@@ -151,10 +152,10 @@
 define <vscale x 4 x i16> @haddu_v4i16(<vscale x 4 x i16> %s0, <vscale x 4 x i16> %s1) {
 ; CHECK-LABEL: haddu_v4i16:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
+; CHECK-NEXT:    uhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
@@ -217,10 +218,10 @@
 define <vscale x 4 x i8> @haddu_v4i8(<vscale x 4 x i8> %s0, <vscale x 4 x i8> %s1) {
 ; CHECK-LABEL: haddu_v4i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    and z1.s, z1.s, #0xff
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
+; CHECK-NEXT:    uhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 4 x i8> %s0 to <vscale x 4 x i16>
@@ -252,10 +253,10 @@
 define <vscale x 8 x i8> @haddu_v8i8(<vscale x 8 x i8> %s0, <vscale x 8 x i8> %s1) {
 ; CHECK-LABEL: haddu_v8i8:
 ; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    add z0.h, z0.h, z1.h
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
+; CHECK-NEXT:    uhadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
@@ -352,12 +353,10 @@
 define <vscale x 2 x i32> @rhaddu_v2i32(<vscale x 2 x i32> %s0, <vscale x 2 x i32> %s1) {
 ; CHECK-LABEL: rhaddu_v2i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.d, #-1 // =0xffffffffffffffff
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    sub z0.d, z1.d, z0.d
-; CHECK-NEXT:    lsr z0.d, z0.d, #1
+; CHECK-NEXT:    urhadd z0.d, p0/m, z0.d, z1.d
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 2 x i32> %s0 to <vscale x 2 x i64>
@@ -467,12 +466,10 @@
 define <vscale x 4 x i16> @rhaddu_v4i16(<vscale x 4 x i16> %s0, <vscale x 4 x i16> %s1) {
 ; CHECK-LABEL: rhaddu_v4i16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.s, #-1 // =0xffffffff
+; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    sub z0.s, z1.s, z0.s
-; CHECK-NEXT:    lsr z0.s, z0.s, #1
+; CHECK-NEXT:    urhadd z0.s, p0/m, z0.s, z1.s
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 4 x i16> %s0 to <vscale x 4 x i32>
@@ -582,12 +579,10 @@
 define <vscale x 8 x i8> @rhaddu_v8i8(<vscale x 8 x i8> %s0, <vscale x 8 x i8> %s1) {
 ; CHECK-LABEL: rhaddu_v8i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov z2.h, #-1 // =0xffff
+; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    and z0.h, z0.h, #0xff
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
-; CHECK-NEXT:    eor z0.d, z0.d, z2.d
-; CHECK-NEXT:    sub z0.h, z1.h, z0.h
-; CHECK-NEXT:    lsr z0.h, z0.h, #1
+; CHECK-NEXT:    urhadd z0.h, p0/m, z0.h, z1.h
 ; CHECK-NEXT:    ret
 entry:
   %s0s = zext <vscale x 8 x i8> %s0 to <vscale x 8 x i16>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
@@ -243,10 +243,7 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmovlb.u16 q1, q1
 ; CHECK-NEXT:    vmovlb.u16 q0, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vshr.u32 q0, q0, #1
+; CHECK-NEXT:    vrhadd.u32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <4 x i16> %s0 to <4 x i32>
@@ -357,10 +354,7 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmovlb.u8 q1, q1
 ; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
-; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    vadd.i16 q0, q0, r0
-; CHECK-NEXT:    vshr.u16 q0, q0, #1
+; CHECK-NEXT:    vrhadd.u16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %s0s = zext <8 x i8> %s0 to <8 x i16>
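Reviewer note: the Thumb2 MVE cases above are the rounding ("ceil") form, which contains two adds; this is why MatchOperands now threads the nested add out through Add2, and both adds must carry the no-wrap flag before widening. A sketch of that pattern (hypothetical function name, with the same fixed-width simplification as the earlier sketch):

; Rounding average ((a + b) + 1) >> 1: both the sum and the +1 add are
; nuw here, matching the Add and Add2 flag checks in the code change.
define <2 x i32> @rhaddu_sketch(<2 x i32> %a, <2 x i32> %b) {
  %ae  = zext <2 x i32> %a to <2 x i64>
  %be  = zext <2 x i32> %b to <2 x i64>
  %sum = add nuw <2 x i64> %ae, %be
  %inc = add nuw <2 x i64> %sum, <i64 1, i64 1>
  %avg = lshr <2 x i64> %inc, <i64 1, i64 1>
  %res = trunc <2 x i64> %avg to <2 x i32>
  ret <2 x i32> %res
}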