Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -426,6 +426,7 @@
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,52 @@
   return SDValue();
 }
 
+/// Combine the integer averaging nodes ISD::AVGFLOORS, ISD::AVGFLOORU,
+/// ISD::AVGCEILS and ISD::AVGCEILU. Returns the simplified value, or an
+/// empty SDValue if no fold applies.
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    // Only the floor forms reduce to a plain shift; the ceil forms add one
+    // before halving.
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  // undef may be chosen equal to x, and avg(x, x) == x for all four opcodes.
+  // Folding to 0 would be a miscompile: e.g. no value of undef makes
+  // avgflooru(0xFFFF, undef) equal to 0.
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5274,6 +5274,30 @@
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }
Index: llvm/test/CodeGen/AArch64/hadd-combine.ll
===================================================================
--- llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -125,8 +125,7 @@
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> %src1)
   ret <8 x i16> %result
@@ -135,9 +134,7 @@
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -146,9 +143,7 @@
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    mvni v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -157,7 +152,6 @@
 define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -292,8 +286,7 @@
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> %src1)
   ret <8 x i16> %result
@@ -302,9 +295,7 @@
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -314,9 +305,7 @@
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #32766
-; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -325,7 +314,6 @@
 define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -474,9 +462,7 @@
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -486,8 +472,6 @@
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -496,7 +480,6 @@
 define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -645,9 +628,7 @@
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -656,10 +637,7 @@
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> , <8 x i16> )
   ret <8 x i16> %result
@@ -668,7 +646,6 @@
 define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result