diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -969,7 +969,7 @@
                        ISD::UINT_TO_FP});
 
   setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
-                       ISD::FP_TO_UINT_SAT, ISD::FDIV});
+                       ISD::FP_TO_UINT_SAT, ISD::FADD, ISD::FDIV});
 
   // Try and combine setcc with csel
   setTargetDAGCombine(ISD::SETCC);
@@ -16472,6 +16472,42 @@
   return SDValue();
 }
 
+static SDValue performFADDCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  if (!N->getFlags().hasAllowReassociation())
+    return SDValue();
+
+  // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
+  auto ReassocComplex = [&](SDValue A, SDValue B) {
+    if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+      return SDValue();
+    unsigned Opc = A.getConstantOperandVal(0);
+    if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
+        Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
+        Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
+        Opc != Intrinsic::aarch64_neon_vcmla_rot270)
+      return SDValue();
+    SDValue VCMLA = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
+        DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
+        A.getOperand(2), A.getOperand(3));
+    VCMLA->setFlags(A->getFlags());
+    return VCMLA;
+  };
+  if (SDValue R = ReassocComplex(LHS, RHS))
+    return R;
+  if (SDValue R = ReassocComplex(RHS, LHS))
+    return R;
+
+  return SDValue();
+}
+
 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
   switch (Opcode) {
   case ISD::STRICT_FADD:
@@ -21569,6 +21605,8 @@
     return performORCombine(N, DCI, Subtarget, *this);
   case ISD::AND:
     return performANDCombine(N, DCI);
+  case ISD::FADD:
+    return performFADDCombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
diff --git a/llvm/test/CodeGen/AArch64/neon-vcmla.ll b/llvm/test/CodeGen/AArch64/neon-vcmla.ll
--- a/llvm/test/CodeGen/AArch64/neon-vcmla.ll
+++ b/llvm/test/CodeGen/AArch64/neon-vcmla.ll
@@ -311,9 +311,7 @@
 define <4 x float> @reassoc_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: reassoc_f32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4s, v1.4s, v2.4s, #0
-; CHECK-NEXT:    fadd v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #0
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
@@ -324,9 +322,7 @@
 define <4 x float> @reassoc_c_f32x4(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 ; CHECK-LABEL: reassoc_c_f32x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4s, v1.4s, v2.4s, #90
-; CHECK-NEXT:    fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fcmla v0.4s, v1.4s, v2.4s, #90
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot90.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
@@ -337,9 +333,7 @@
 define <4 x half> @reassoc_f16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 ; CHECK-LABEL: reassoc_f16x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d3, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4h, v1.4h, v2.4h, #180
-; CHECK-NEXT:    fadd v0.4h, v3.4h, v0.4h
+; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #180
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot180.v4f16(<4 x half> zeroinitializer, <4 x half> %b, <4 x half> %c)
@@ -350,9 +344,7 @@
 define <4 x half> @reassoc_c_f16x4(<4 x half> %a, <4 x half> %b, <4 x half> %c) {
 ; CHECK-LABEL: reassoc_c_f16x4:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi d3, #0000000000000000
-; CHECK-NEXT:    fcmla v3.4h, v1.4h, v2.4h, #270
-; CHECK-NEXT:    fadd v0.4h, v0.4h, v3.4h
+; CHECK-NEXT:    fcmla v0.4h, v1.4h, v2.4h, #270
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <4 x half> @llvm.aarch64.neon.vcmla.rot270.v4f16(<4 x half> zeroinitializer, <4 x half> %b, <4 x half> %c)
@@ -363,10 +355,8 @@
 define <2 x double> @reassoc_f64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %g) {
 ; CHECK-LABEL: reassoc_f64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v4.2d, #0000000000000000
 ; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #270
-; CHECK-NEXT:    fcmla v4.2d, v2.2d, v3.2d, #270
-; CHECK-NEXT:    fadd v0.2d, v4.2d, v0.2d
+; CHECK-NEXT:    fcmla v0.2d, v2.2d, v3.2d, #270
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
@@ -378,10 +368,9 @@
 define <2 x double> @reassoc_c_f64x2(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %g) {
 ; CHECK-LABEL: reassoc_c_f64x2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov v4.16b, v0.16b
+; CHECK-NEXT:    fadd v0.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fcmla v0.2d, v1.2d, v2.2d, #270
 ; CHECK-NEXT:    fcmla v0.2d, v2.2d, v3.2d, #270
-; CHECK-NEXT:    fcmla v4.2d, v1.2d, v2.2d, #270
-; CHECK-NEXT:    fadd v0.2d, v0.2d, v4.2d
 ; CHECK-NEXT:    ret
 entry:
   %d = tail call <2 x double> @llvm.aarch64.neon.vcmla.rot270.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
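
For reference, a minimal standalone sketch of the input pattern the new combine targets; it is not part of the patch, and the function name, fast-math flags, and llc invocation here are illustrative. performFADDCombine only fires when the fadd carries the reassoc flag; with full fast flags the fadd of the zero accumulator can also fold away afterwards, so compiling with an FCMLA-capable target (e.g. llc -mtriple=aarch64 -mattr=+complxnum, plus +fullfp16 for the half-precision cases) is expected to emit a single fcmla, as in the updated CHECK lines above.

declare <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float>, <4 x float>, <4 x float>)

define <4 x float> @sketch_fadd_into_vcmla(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
entry:
  ; vcmla into a zero accumulator...
  %d = tail call <4 x float> @llvm.aarch64.neon.vcmla.rot0.v4f32(<4 x float> zeroinitializer, <4 x float> %b, <4 x float> %c)
  ; ...followed by a reassociable fadd. The combine rewrites this to
  ; vcmla(fadd(%a, zeroinitializer), %b, %c), i.e. %a becomes the accumulator.
  %res = fadd fast <4 x float> %a, %d
  ret <4 x float> %res
}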