Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1053,6 +1053,14 @@
       setOperationAction(ISD::ABDS, VT, Legal);
       setOperationAction(ISD::ABDU, VT, Legal);
     }
+    for (MVT VT : {MVT::v2i16, MVT::v4i8, MVT::v2i8}) {
+      setOperationAction(ISD::AVGFLOORS, VT, Custom);
+      setOperationAction(ISD::AVGFLOORU, VT, Custom);
+      setOperationAction(ISD::AVGCEILS, VT, Custom);
+      setOperationAction(ISD::AVGCEILU, VT, Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
+    }
 
     // Vector reductions
     for (MVT VT : { MVT::v4f16, MVT::v2f32,
@@ -18555,6 +18563,28 @@
   Results.push_back(SDValue(CmpSwap, 3));
 }
 
+// Custom lower ABD, HADD and RHADD nodes that are smaller than legal, using
+// a bitcast to a larger legal type on which we perform the operation. This
+// allows DAG combine to recognize the nodes where it usually would not.
+static SDValue LowerBinopWithBitcast(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  assert((VT == MVT::v4i8 || VT == MVT::v2i16 || VT == MVT::v2i8) &&
+         "Expected smaller than legal type!");
+
+  EVT ExtVT = VT == MVT::v4i8 ? MVT::v4i16 : MVT::v2i32;
+  EVT BinOpVT = VT == MVT::v2i16 ? MVT::v4i16 : MVT::v8i8;
+
+  SDLoc DL(N);
+  SDValue Ext0 = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, N->getOperand(0));
+  SDValue Ext1 = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, N->getOperand(1));
+  SDValue BC0 = DAG.getNode(AArch64ISD::NVCAST, DL, BinOpVT, Ext0);
+  SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, BinOpVT, Ext1);
+  SDValue BinOp = DAG.getNode(N->getOpcode(), DL, BinOpVT, BC0, BC1);
+  SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, ExtVT, BinOp);
+  return DAG.getNode(ISD::TRUNCATE, DL, VT, BC2);
+}
+
 void AArch64TargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
@@ -18633,6 +18663,16 @@
     // Custom lowering has been requested for INSERT_SUBVECTOR -- but delegate
     // to common code for result type legalisation
    return;
+  case ISD::AVGCEILS:
+  case ISD::AVGCEILU:
+  case ISD::AVGFLOORS:
+  case ISD::AVGFLOORU:
+  case ISD::ABDS:
+  case ISD::ABDU: {
+    SDValue Res = LowerBinopWithBitcast(N, DAG);
+    Results.push_back(Res);
+    return;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     EVT VT = N->getValueType(0);
     assert((VT == MVT::i8 || VT == MVT::i16) &&
Index: llvm/test/CodeGen/AArch64/arm64-vhadd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vhadd.ll
+++ llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -364,16 +364,7 @@
 define void @testLowerToSRHADD4b(<4 x i8> %src1, <4 x i8> %src2, <4 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToSRHADD4b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: shl.4s v0, v0, #24
-; CHECK-NEXT: shl.4s v1, v1, #24
-; CHECK-NEXT: sshr.4s v0, v0, #24
-; CHECK-NEXT: sshr.4s v1, v1, #24
-; CHECK-NEXT: mvn.16b v0, v0
-; CHECK-NEXT: sub.4s v0, v1, v0
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: ushr.4h v0, v0, #1
+; CHECK-NEXT: srhadd.8b v0, v0, v1
 ; CHECK-NEXT: xtn.8b v0, v0
 ; CHECK-NEXT: str s0, [x0]
 ; CHECK-NEXT: ret
@@ -390,13 +381,7 @@
 define void @testLowerToSRHADD2h(<2 x i16> %src1, <2 x i16> %src2, <2 x i16>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToSRHADD2h:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shl.2s v0, v0, #16
-; CHECK-NEXT: shl.2s v1, v1, #16
-; CHECK-NEXT: sshr.2s v0, v0, #16
-; CHECK-NEXT: sshr.2s v1, v1, #16
-; CHECK-NEXT: mvn.8b v0, v0
-; CHECK-NEXT: sub.2s v0, v1, v0
-; CHECK-NEXT: ushr.2s v0, v0, #1
+; CHECK-NEXT: srhadd.4h v0, v0, v1
 ; CHECK-NEXT: mov.s w8, v0[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strh w9, [x0]
@@ -415,13 +400,7 @@
 define void @testLowerToSRHADD2b(<2 x i8> %src1, <2 x i8> %src2, <2 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToSRHADD2b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shl.2s v0, v0, #24
-; CHECK-NEXT: shl.2s v1, v1, #24
-; CHECK-NEXT: sshr.2s v0, v0, #24
-; CHECK-NEXT: sshr.2s v1, v1, #24
-; CHECK-NEXT: mvn.8b v0, v0
-; CHECK-NEXT: sub.2s v0, v1, v0
-; CHECK-NEXT: ushr.2s v0, v0, #1
+; CHECK-NEXT: srhadd.8b v0, v0, v1
 ; CHECK-NEXT: mov.s w8, v0[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strb w9, [x0]
@@ -626,12 +605,7 @@
 define void @testLowerToURHADD2h(<2 x i16> %src1, <2 x i16> %src2, <2 x i16>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToURHADD2h:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi d2, #0x00ffff0000ffff
-; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: and.8b v1, v1, v2
-; CHECK-NEXT: mvn.8b v0, v0
-; CHECK-NEXT: sub.2s v0, v1, v0
-; CHECK-NEXT: ushr.2s v0, v0, #1
+; CHECK-NEXT: urhadd.4h v0, v0, v1
 ; CHECK-NEXT: mov.s w8, v0[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strh w9, [x0]
@@ -650,12 +624,7 @@
 define void @testLowerToURHADD2b(<2 x i8> %src1, <2 x i8> %src2, <2 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToURHADD2b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: movi d2, #0x0000ff000000ff
-; CHECK-NEXT: and.8b v0, v0, v2
-; CHECK-NEXT: and.8b v1, v1, v2
-; CHECK-NEXT: mvn.8b v0, v0
-; CHECK-NEXT: sub.2s v0, v1, v0
-; CHECK-NEXT: ushr.2s v0, v0, #1
+; CHECK-NEXT: urhadd.8b v0, v0, v1
 ; CHECK-NEXT: mov.s w8, v0[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strb w9, [x0]
@@ -674,14 +643,7 @@
 define void @testLowerToURHADD4b(<4 x i8> %src1, <4 x i8> %src2, <4 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLowerToURHADD4b:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: ushll.4s v0, v0, #0
-; CHECK-NEXT: ushll.4s v1, v1, #0
-; CHECK-NEXT: mvn.16b v0, v0
-; CHECK-NEXT: sub.4s v0, v1, v0
-; CHECK-NEXT: xtn.4h v0, v0
-; CHECK-NEXT: ushr.4h v0, v0, #1
+; CHECK-NEXT: urhadd.8b v0, v0, v1
 ; CHECK-NEXT: xtn.8b v0, v0
 ; CHECK-NEXT: str s0, [x0]
 ; CHECK-NEXT: ret
@@ -965,11 +927,9 @@
 define <4 x i16> @hadd8_sext_asr(<4 x i8> %src1, <4 x i8> %src2) nounwind {
 ; CHECK-LABEL: hadd8_sext_asr:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: shadd.8b v0, v0, v1
 ; CHECK-NEXT: shl.4h v0, v0, #8
-; CHECK-NEXT: shl.4h v1, v1, #8
 ; CHECK-NEXT: sshr.4h v0, v0, #8
-; CHECK-NEXT: ssra.4h v0, v1, #8
-; CHECK-NEXT: sshr.4h v0, v0, #1
 ; CHECK-NEXT: ret
   %zextsrc1 = sext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = sext <4 x i8> %src2 to <4 x i16>
@@ -981,10 +941,8 @@
 define <4 x i16> @hadd8_zext_asr(<4 x i8> %src1, <4 x i8> %src2) nounwind {
 ; CHECK-LABEL: hadd8_zext_asr:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: uhadd.8b v0, v0, v1
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: add.4h v0, v0, v1
-; CHECK-NEXT: ushr.4h v0, v0, #1
 ; CHECK-NEXT: ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
@@ -1012,10 +970,8 @@
 define <4 x i16> @hadd8_zext_lsr(<4 x i8> %src1, <4 x i8> %src2) nounwind {
 ; CHECK-LABEL: hadd8_zext_lsr:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: uhadd.8b v0, v0, v1
 ; CHECK-NEXT: bic.4h v0, #255, lsl #8
-; CHECK-NEXT: bic.4h v1, #255, lsl #8
-; CHECK-NEXT: add.4h v0, v0, v1
-; CHECK-NEXT: ushr.4h v0, v0, #1
 ; CHECK-NEXT: ret
   %zextsrc1 = zext <4 x i8> %src1 to <4 x i16>
   %zextsrc2 = zext <4 x i8> %src2 to <4 x i16>
Index: llvm/test/CodeGen/AArch64/neon-abd.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-abd.ll
+++ llvm/test/CodeGen/AArch64/neon-abd.ll
@@ -49,12 +49,8 @@
 define <4 x i16> @sabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: sabd_4h_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: sshr v1.4h, v1.4h, #8
-; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: abs v0.4h, v0.4h
+; CHECK-NEXT: sabd v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bic v0.4h, #255, lsl #8
 ; CHECK-NEXT: ret
   %a.sext = sext <4 x i8> %a to <4 x i16>
   %b.sext = sext <4 x i8> %b to <4 x i16>
@@ -104,12 +100,9 @@
 define <2 x i32> @sabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) #0 {
 ; CHECK-LABEL: sabd_2s_promoted_ops:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: sshr v0.2s, v0.2s, #16
-; CHECK-NEXT: sshr v1.2s, v1.2s, #16
-; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: abs v0.2s, v0.2s
+; CHECK-NEXT: movi d2, #0x00ffff0000ffff
+; CHECK-NEXT: sabd v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT: ret
   %a.sext = sext <2 x i16> %a to <2 x i32>
   %b.sext = sext <2 x i16> %b to <2 x i32>
@@ -232,10 +225,8 @@
 define <4 x i16> @uabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) #0 {
 ; CHECK-LABEL: uabd_4h_promoted_ops:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: uabd v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: bic v0.4h, #255, lsl #8
-; CHECK-NEXT: bic v1.4h, #255, lsl #8
-; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: abs v0.4h, v0.4h
 ; CHECK-NEXT: ret
   %a.zext = zext <4 x i8> %a to <4 x i16>
   %b.zext = zext <4 x i8> %b to <4 x i16>
@@ -286,10 +277,8 @@
 ; CHECK-LABEL: uabd_2s_promoted_ops:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: movi d2, #0x00ffff0000ffff
+; CHECK-NEXT: uabd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: abs v0.2s, v0.2s
 ; CHECK-NEXT: ret
   %a.zext = zext <2 x i16> %a to <2 x i32>
   %b.zext = zext <2 x i16> %b to <2 x i32>
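
For intuition, here is a minimal standalone C++ sketch (not LLVM code, and not part of the patch) of why the any-extend + NVCAST trick in LowerBinopWithBitcast is sound, using the v4i8 AVGCEILS (srhadd) case. It assumes a little-endian lane layout, models NVCAST as a plain reinterpretation of the same 64 bits, and srhadd_lane is a hypothetical scalar model of one SRHADD byte lane, not an LLVM or ACLE API.

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of one SRHADD byte lane: (a + b + 1) >> 1, computed in 9-bit
// precision. (Hypothetical helper; assumes arithmetic right shift.)
static int8_t srhadd_lane(int8_t a, int8_t b) {
  return static_cast<int8_t>((int16_t(a) + int16_t(b) + 1) >> 1);
}

int main() {
  const int8_t src1[4] = {127, -128, 3, -7};
  const int8_t src2[4] = {1, -1, 4, -9};

  // ISD::ANY_EXTEND v4i8 -> v4i16: only the low byte of each lane is defined.
  uint16_t ext0[4], ext1[4];
  for (int i = 0; i < 4; ++i) {
    ext0[i] = uint8_t(src1[i]); // high byte is undef; zero here
    ext1[i] = uint8_t(src2[i]);
  }

  // AArch64ISD::NVCAST v4i16 -> v8i8: reinterpret the same 64 bits, so the
  // real data lands in the even byte lanes on a little-endian target.
  int8_t bc0[8], bc1[8];
  std::memcpy(bc0, ext0, sizeof(bc0));
  std::memcpy(bc1, ext1, sizeof(bc1));

  // srhadd.8b is lane-wise over all 8 byte lanes; the undef lanes produce
  // garbage but cannot contaminate the lanes holding the real data.
  int8_t avg[8];
  for (int i = 0; i < 8; ++i)
    avg[i] = srhadd_lane(bc0[i], bc1[i]);

  // NVCAST back to v4i16, then ISD::TRUNCATE to v4i8 keeps each lane's low
  // byte, i.e. exactly the even byte lanes that held the real data.
  uint16_t back[4];
  std::memcpy(back, avg, sizeof(back));
  for (int i = 0; i < 4; ++i)
    assert(static_cast<int8_t>(back[i] & 0xFF) ==
           srhadd_lane(src1[i], src2[i]));
  return 0;
}

The key property is that the NEON binop is strictly lane-wise, so the garbage in the any-extended high bytes never influences the bytes the final truncate keeps; the same argument covers the ABD and floor-average cases, and the v2i16/v2i8 types with their v2i32 promotion.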