diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1033,13 +1033,17 @@ if (!TLI.isOperationLegalOrCustom(AVGOpc, NVT)) { // If we could not transform, and (both) adds are nuw/nsw, we can use the // larger type size to do the transform. - if (((!IsSigned && Add->getFlags().hasNoUnsignedWrap() && - (!Add2 || Add2->getFlags().hasNoUnsignedWrap())) || - (IsSigned && Add->getFlags().hasNoSignedWrap() && - (!Add2 || Add2->getFlags().hasNoSignedWrap()))) && - TLI.isOperationLegalOrCustom(AVGOpc, VT)) { + if (!TLI.isOperationLegalOrCustom(AVGOpc, VT)) + return SDValue(); + + if (DAG.computeOverflowForAdd(IsSigned, Add.getOperand(0), + Add.getOperand(1)) == + SelectionDAG::OFK_Never && + (!Add2 || DAG.computeOverflowForAdd(IsSigned, Add2.getOperand(0), + Add2.getOperand(1)) == + SelectionDAG::OFK_Never)) NVT = VT; - } else + else return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -873,8 +873,8 @@ ; CHECK-NEXT: shl.2s v0, v0, #24 ; CHECK-NEXT: shl.2s v1, v1, #24 ; CHECK-NEXT: sshr.2s v0, v0, #24 -; CHECK-NEXT: ssra.2s v0, v1, #24 -; CHECK-NEXT: sshr.2s v0, v0, #1 +; CHECK-NEXT: sshr.2s v1, v1, #24 +; CHECK-NEXT: shadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> %zextsrc2 = sext <2 x i8> %src2 to <2 x i16> @@ -889,8 +889,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: add.2s v0, v0, v1 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: uhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -923,8 +922,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: add.2s v0, v0, v1 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: uhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -1006,9 +1004,7 @@ ; CHECK-NEXT: shl.2s v1, v1, #24 ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: sshr.2s v1, v1, #24 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: sshr.2s v0, v0, #1 +; CHECK-NEXT: srhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> %zextsrc2 = sext <2 x i8> %src2 to <2 x i16> @@ -1024,9 +1020,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: urhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> @@ -1063,9 +1057,7 @@ ; CHECK-NEXT: movi d2, #0x0000ff000000ff ; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: and.8b v1, v1, v2 -; CHECK-NEXT: mvn.8b v0, v0 -; CHECK-NEXT: sub.2s v0, v1, v0 -; CHECK-NEXT: ushr.2s v0, v0, #1 +; CHECK-NEXT: urhadd.2s v0, v0, v1 ; CHECK-NEXT: ret %zextsrc1 = zext <2 x i8> %src1 to <2 x i16> %zextsrc2 = zext <2 x i8> %src2 to <2 x i16> diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -219,14 +219,22 @@ } define @hadds_v2i16( %s0, %s1) { -; CHECK-LABEL: hadds_v2i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sxth z0.d, p0/m, z0.d -; CHECK-NEXT: sxth z1.d, p0/m, z1.d -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: asr z0.d, z0.d, #1 -; CHECK-NEXT: ret +; SVE-LABEL: hadds_v2i16: +; SVE: // %bb.0: // %entry +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: sxth z0.d, p0/m, z0.d +; SVE-NEXT: sxth z1.d, p0/m, z1.d +; SVE-NEXT: add z0.d, z0.d, z1.d +; SVE-NEXT: asr z0.d, z0.d, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: hadds_v2i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: sxth z0.d, p0/m, z0.d +; SVE2-NEXT: sxth z1.d, p0/m, z1.d +; SVE2-NEXT: shadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret entry: %s0s = sext %s0 to %s1s = sext %s1 to @@ -256,13 +264,21 @@ } define @haddu_v2i16( %s0, %s1) { -; CHECK-LABEL: haddu_v2i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.d, z0.d, #0xffff -; CHECK-NEXT: and z1.d, z1.d, #0xffff -; CHECK-NEXT: add z0.d, z0.d, z1.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: ret +; SVE-LABEL: haddu_v2i16: +; SVE: // %bb.0: // %entry +; SVE-NEXT: and z0.d, z0.d, #0xffff +; SVE-NEXT: and z1.d, z1.d, #0xffff +; SVE-NEXT: add z0.d, z0.d, z1.d +; SVE-NEXT: lsr z0.d, z0.d, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: haddu_v2i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: and z0.d, z0.d, #0xffff +; SVE2-NEXT: and z1.d, z1.d, #0xffff +; SVE2-NEXT: uhadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret entry: %s0s = zext %s0 to %s1s = zext %s1 to @@ -417,14 +433,22 @@ } define @hadds_v4i8( %s0, %s1) { -; CHECK-LABEL: hadds_v4i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: sxtb z0.s, p0/m, z0.s -; CHECK-NEXT: sxtb z1.s, p0/m, z1.s -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: asr z0.s, z0.s, #1 -; CHECK-NEXT: ret +; SVE-LABEL: hadds_v4i8: +; SVE: // %bb.0: // %entry +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: sxtb z0.s, p0/m, z0.s +; SVE-NEXT: sxtb z1.s, p0/m, z1.s +; SVE-NEXT: add z0.s, z0.s, z1.s +; SVE-NEXT: asr z0.s, z0.s, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: hadds_v4i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: sxtb z0.s, p0/m, z0.s +; SVE2-NEXT: sxtb z1.s, p0/m, z1.s +; SVE2-NEXT: shadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ret entry: %s0s = sext %s0 to %s1s = sext %s1 to @@ -454,13 +478,21 @@ } define @haddu_v4i8( %s0, %s1) { -; CHECK-LABEL: haddu_v4i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: add z0.s, z0.s, z1.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: ret +; SVE-LABEL: haddu_v4i8: +; SVE: // %bb.0: // %entry +; SVE-NEXT: and z0.s, z0.s, #0xff +; SVE-NEXT: and z1.s, z1.s, #0xff +; SVE-NEXT: add z0.s, z0.s, z1.s +; SVE-NEXT: lsr z0.s, z0.s, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: haddu_v4i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: and z0.s, z0.s, #0xff +; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: uhadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ret entry: %s0s = zext %s0 to %s1s = zext %s1 to @@ -693,16 +725,24 @@ } define @rhadds_v2i32( %s0, %s1) { -; CHECK-LABEL: rhadds_v2i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sxtw z0.d, p0/m, z0.d -; CHECK-NEXT: sxtw z1.d, p0/m, z1.d -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: asr z0.d, z0.d, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhadds_v2i32: +; SVE: // %bb.0: // %entry +; SVE-NEXT: ptrue p0.d +; SVE-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; SVE-NEXT: sxtw z0.d, p0/m, z0.d +; SVE-NEXT: sxtw z1.d, p0/m, z1.d +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.d, z1.d, z0.d +; SVE-NEXT: asr z0.d, z0.d, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhadds_v2i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: sxtw z0.d, p0/m, z0.d +; SVE2-NEXT: sxtw z1.d, p0/m, z1.d +; SVE2-NEXT: srhadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret entry: %s0s = sext %s0 to %s1s = sext %s1 to @@ -884,15 +924,23 @@ } define @rhaddu_v2i16( %s0, %s1) { -; CHECK-LABEL: rhaddu_v2i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.d, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z0.d, #0xffff -; CHECK-NEXT: and z1.d, z1.d, #0xffff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.d, z1.d, z0.d -; CHECK-NEXT: lsr z0.d, z0.d, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhaddu_v2i16: +; SVE: // %bb.0: // %entry +; SVE-NEXT: mov z2.d, #-1 // =0xffffffffffffffff +; SVE-NEXT: and z0.d, z0.d, #0xffff +; SVE-NEXT: and z1.d, z1.d, #0xffff +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.d, z1.d, z0.d +; SVE-NEXT: lsr z0.d, z0.d, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhaddu_v2i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.d +; SVE2-NEXT: and z0.d, z0.d, #0xffff +; SVE2-NEXT: and z1.d, z1.d, #0xffff +; SVE2-NEXT: urhadd z0.d, p0/m, z0.d, z1.d +; SVE2-NEXT: ret entry: %s0s = zext %s0 to %s1s = zext %s1 to @@ -904,16 +952,24 @@ } define @rhadds_v4i16( %s0, %s1) { -; CHECK-LABEL: rhadds_v4i16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sxth z0.s, p0/m, z0.s -; CHECK-NEXT: sxth z1.s, p0/m, z1.s -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: asr z0.s, z0.s, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhadds_v4i16: +; SVE: // %bb.0: // %entry +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; SVE-NEXT: sxth z0.s, p0/m, z0.s +; SVE-NEXT: sxth z1.s, p0/m, z1.s +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.s, z1.s, z0.s +; SVE-NEXT: asr z0.s, z0.s, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhadds_v4i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: sxth z0.s, p0/m, z0.s +; SVE2-NEXT: sxth z1.s, p0/m, z1.s +; SVE2-NEXT: srhadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ret entry: %s0s = sext %s0 to %s1s = sext %s1 to @@ -1095,15 +1151,23 @@ } define @rhaddu_v4i8( %s0, %s1) { -; CHECK-LABEL: rhaddu_v4i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: and z1.s, z1.s, #0xff -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.s, z1.s, z0.s -; CHECK-NEXT: lsr z0.s, z0.s, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhaddu_v4i8: +; SVE: // %bb.0: // %entry +; SVE-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; SVE-NEXT: and z0.s, z0.s, #0xff +; SVE-NEXT: and z1.s, z1.s, #0xff +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.s, z1.s, z0.s +; SVE-NEXT: lsr z0.s, z0.s, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhaddu_v4i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: and z0.s, z0.s, #0xff +; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: urhadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ret entry: %s0s = zext %s0 to %s1s = zext %s1 to @@ -1115,16 +1179,24 @@ } define @rhadds_v8i8( %s0, %s1) { -; CHECK-LABEL: rhadds_v8i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z2.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sxtb z0.h, p0/m, z0.h -; CHECK-NEXT: sxtb z1.h, p0/m, z1.h -; CHECK-NEXT: eor z0.d, z0.d, z2.d -; CHECK-NEXT: sub z0.h, z1.h, z0.h -; CHECK-NEXT: asr z0.h, z0.h, #1 -; CHECK-NEXT: ret +; SVE-LABEL: rhadds_v8i8: +; SVE: // %bb.0: // %entry +; SVE-NEXT: ptrue p0.h +; SVE-NEXT: mov z2.h, #-1 // =0xffffffffffffffff +; SVE-NEXT: sxtb z0.h, p0/m, z0.h +; SVE-NEXT: sxtb z1.h, p0/m, z1.h +; SVE-NEXT: eor z0.d, z0.d, z2.d +; SVE-NEXT: sub z0.h, z1.h, z0.h +; SVE-NEXT: asr z0.h, z0.h, #1 +; SVE-NEXT: ret +; +; SVE2-LABEL: rhadds_v8i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.h +; SVE2-NEXT: sxtb z0.h, p0/m, z0.h +; SVE2-NEXT: sxtb z1.h, p0/m, z1.h +; SVE2-NEXT: srhadd z0.h, p0/m, z0.h, z1.h +; SVE2-NEXT: ret entry: %s0s = sext %s0 to %s1s = sext %s1 to diff --git a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll --- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -116,8 +116,7 @@ ; CHECK-NEXT: vmov.i32 q2, #0xff ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i8> %s0 to <4 x i16> @@ -313,12 +312,9 @@ ; CHECK-LABEL: vrhaddu_v4i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmov.i32 q2, #0xff -; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vshr.u32 q0, q0, #1 +; CHECK-NEXT: vrhadd.u32 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %s0s = zext <4 x i8> %s0 to <4 x i16>