diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -426,6 +426,7 @@
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
+    SDValue visitAVG(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitMULO(SDNode *N);
@@ -1635,6 +1636,10 @@
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:           return visitAVG(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:
@@ -4654,6 +4659,46 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAVG(SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // fold (avg c1, c2)
+  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+    return C;
+
+  // canonicalize constant to RHS.
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
+
+  if (VT.isVector()) {
+    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+      return FoldedVOp;
+
+    // fold (avgfloor x, 0) -> x >> 1
+    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
+      if (Opcode == ISD::AVGFLOORS)
+        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
+      if (Opcode == ISD::AVGFLOORU)
+        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
+    }
+  }
+
+  // fold (avg x, undef) -> x
+  if (N0.isUndef())
+    return N1;
+  if (N1.isUndef())
+    return N0;
+
+  // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
+
+  return SDValue();
+}
+
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Return
 /// true if a simplification was made.
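For reference, the scalar semantics these folds rely on can be sketched in plain C++. This snippet is an illustration only, not part of the patch; the helper names avgfloors/avgflooru/avgceils/avgceilu are ad hoc stand-ins for ISD::AVGFLOORS and friends, assuming 16-bit lanes widened to 32 bits so the intermediate sum cannot overflow.

#include <cassert>
#include <cstdint>

// Reference semantics of the four averaging nodes on a single lane,
// computed in a wider type so the intermediate sum cannot wrap.
static int16_t avgfloors(int16_t a, int16_t b) { return (int16_t)(((int32_t)a + (int32_t)b) >> 1); }
static uint16_t avgflooru(uint16_t a, uint16_t b) { return (uint16_t)(((uint32_t)a + (uint32_t)b) >> 1); }
static int16_t avgceils(int16_t a, int16_t b) { return (int16_t)(((int32_t)a + (int32_t)b + 1) >> 1); }
static uint16_t avgceilu(uint16_t a, uint16_t b) { return (uint16_t)(((uint32_t)a + (uint32_t)b + 1) >> 1); }

int main() {
  // (avgflooru x, 0) == x >> 1, matching the fold to ISD::SRL above.
  assert(avgflooru(42, 0) == 21);
  // (avgfloors x, 0) == arithmetic x >> 1, matching the fold to ISD::SRA.
  assert(avgfloors(-7, 0) == -4);
  // Constant operands fold away entirely, as in the hadd-combine.ll tests:
  // uhadd/urhadd of splat 1 and splat 3 both become splat 2.
  assert(avgflooru(1, 3) == 2 && avgceilu(1, 3) == 2);
  // The unsigned "bothhigh" cases: 65534 and 65535 average to 65534
  // (floor) and 65535 (ceil), so the calls fold to constant splats.
  assert(avgflooru(65534, 65535) == 65534 && avgceilu(65534, 65535) == 65535);
  return 0;
}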
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5274,6 +5274,30 @@
     APInt C2Ext = C2.zext(FullWidth);
     return (C1Ext * C2Ext).extractBits(C1.getBitWidth(), C1.getBitWidth());
   }
+  case ISD::AVGFLOORS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGFLOORU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILS: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.sext(FullWidth);
+    APInt C2Ext = C2.sext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
+  case ISD::AVGCEILU: {
+    unsigned FullWidth = C1.getBitWidth() + 1;
+    APInt C1Ext = C1.zext(FullWidth);
+    APInt C2Ext = C2.zext(FullWidth);
+    return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
+  }
   }
   return llvm::None;
 }
diff --git a/llvm/test/CodeGen/AArch64/hadd-combine.ll b/llvm/test/CodeGen/AArch64/hadd-combine.ll
--- a/llvm/test/CodeGen/AArch64/hadd-combine.ll
+++ b/llvm/test/CodeGen/AArch64/hadd-combine.ll
@@ -135,8 +135,7 @@
 define <8 x i16> @haddu_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -145,9 +144,7 @@
 define <8 x i16> @haddu_i_const_both() {
 ; CHECK-LABEL: haddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -156,18 +153,16 @@
 define <8 x i16> @haddu_i_const_bothhigh() {
 ; CHECK-LABEL: haddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    uhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    mvni v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @haddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @haddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: haddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -312,8 +307,7 @@
 define <8 x i16> @hadds_i_const_zero(<8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_const_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    sshr v0.8h, v0.8h, #1
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -322,9 +316,7 @@
 define <8 x i16> @hadds_i_const_both() {
 ; CHECK-LABEL: hadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -334,18 +326,16 @@
 ; CHECK-LABEL: hadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #32766
-; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    shadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    dup v0.8h, w8
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @hadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @hadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: hadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    shadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -508,9 +498,7 @@
 define <8 x i16> @rhaddu_i_const_both() {
 ; CHECK-LABEL: rhaddu_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -520,17 +508,15 @@
 ; CHECK-LABEL: rhaddu_i_const_bothhigh:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0xffffffffffffffff
-; CHECK-NEXT:    mvni v1.8h, #1
-; CHECK-NEXT:    urhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> <i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534, i16 65534>, <8 x i16> <i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535, i16 65535>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhaddu_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhaddu_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhaddu_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    urhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
@@ -693,9 +679,7 @@
 define <8 x i16> @rhadds_i_const_both() {
 ; CHECK-LABEL: rhadds_i_const_both:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v0.8h, #1
-; CHECK-NEXT:    movi v1.8h, #3
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    movi v0.8h, #2
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
   ret <8 x i16> %result
@@ -704,19 +688,16 @@
 define <8 x i16> @rhadds_i_const_bothhigh() {
 ; CHECK-LABEL: rhadds_i_const_bothhigh:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #32766
 ; CHECK-NEXT:    mvni v0.8h, #128, lsl #8
-; CHECK-NEXT:    dup v1.8h, w8
-; CHECK-NEXT:    srhadd v0.8h, v1.8h, v0.8h
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766>, <8 x i16> <i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767, i16 32767>)
   ret <8 x i16> %result
 }
 
-define <8 x i16> @rhadds_i_undef(<8 x i16> %src1) {
+define <8 x i16> @rhadds_i_undef(<8 x i16> %t, <8 x i16> %src1) {
 ; CHECK-LABEL: rhadds_i_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    srhadd v0.8h, v0.8h, v0.8h
+; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %result = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> undef, <8 x i16> %src1)
   ret <8 x i16> %result
diff --git a/llvm/test/CodeGen/X86/pic-load-remat.ll b/llvm/test/CodeGen/X86/pic-load-remat.ll
--- a/llvm/test/CodeGen/X86/pic-load-remat.ll
+++ b/llvm/test/CodeGen/X86/pic-load-remat.ll
@@ -7,10 +7,9 @@
 ; CHECK-NEXT:    calll L0$pb
 ; CHECK-NEXT:  L0$pb:
 ; CHECK-NEXT:    popl %eax
-; CHECK-NEXT:    pxor %xmm0, %xmm0
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1]
 ; CHECK-NEXT:    psllw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm1
-; CHECK-NEXT:    pavgw {{\.?LCPI[0-9]+_[0-9]+}}-L0$pb(%eax), %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [21183,21183,21183,21183,21183,21183,21183,21183]
 ; CHECK-NEXT:    paddsw %xmm0, %xmm0
 ; CHECK-NEXT:    paddw %xmm1, %xmm0
 ; CHECK-NEXT:    .p2align 4, 0x90
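The constant-folding cases added to SelectionDAG.cpp all follow the same APInt recipe: widen both operands by one bit so the sum cannot wrap, add one for the rounding ("ceil") flavours, then extract getBitWidth() bits starting at bit position 1, which performs the divide-by-two. A minimal standalone sketch of that recipe follows; it is not part of the patch, avgCeilU is an ad hoc helper name, and building it is assumed to require the LLVM headers and support library.

#include "llvm/ADT/APInt.h"
#include <cassert>

using llvm::APInt;

// Same steps as the ISD::AVGCEILU case above: widen by one bit, add with
// rounding, then drop the low bit by extracting from bit position 1.
static APInt avgCeilU(const APInt &C1, const APInt &C2) {
  unsigned FullWidth = C1.getBitWidth() + 1;
  APInt C1Ext = C1.zext(FullWidth);
  APInt C2Ext = C2.zext(FullWidth);
  return (C1Ext + C2Ext + 1).extractBits(C1.getBitWidth(), 1);
}

int main() {
  // i16 65534 and 65535: the rounding average is 65535, so the urhadd call
  // in rhaddu_i_const_bothhigh folds to the all-ones splat in the updated test.
  APInt A(16, 65534), B(16, 65535);
  assert(avgCeilU(A, B).getZExtValue() == 65535);
  return 0;
}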