diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17883,8 +17883,8 @@
 // instruction can still be used profitably. This function puts the DAG into a
 // more appropriate form for those patterns to trigger.
 static SDValue performAddSubLongCombine(SDNode *N,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        SelectionDAG &DAG) {
+                                        TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -18218,9 +18218,9 @@
 // for vectors of type <1 x i64> and <2 x i64> when SVE is available.
 // It will transform the add/sub to a scalable version, so that we can
 // make use of SVE's MLA/MLS that will be generated for that pattern
-static SDValue performMulAddSubCombine(SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       SelectionDAG &DAG) {
+static SDValue
+performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
   // Make sure that the types are legal
   if (!DCI.isAfterLegalizeDAG())
     return SDValue();
@@ -18266,28 +18266,64 @@
   return SDValue();
 }
 
+// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
+// help, for example, to produce ssra from sshr+add.
+static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64)
+    return SDValue();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // At least one of the operands should be an extract, and the other should be
+  // something that is easy to convert to v1i64 type (in this case a load).
+  if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+      Op0.getOpcode() != ISD::LOAD)
+    return SDValue();
+  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
+      Op1.getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  SDLoc DL(N);
+  if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+      Op0.getOperand(0).getValueType() == MVT::v1i64) {
+    Op0 = Op0.getOperand(0);
+    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
+  } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+             Op1.getOperand(0).getValueType() == MVT::v1i64) {
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
+    Op1 = Op1.getOperand(0);
+  } else
+    return SDValue();
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
+                     DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
 static SDValue performAddSubCombine(SDNode *N,
-                                    TargetLowering::DAGCombinerInfo &DCI,
-                                    SelectionDAG &DAG) {
-  if (SDValue Val = performMulAddSubCombine(N, DCI, DAG))
-    return Val;
+                                    TargetLowering::DAGCombinerInfo &DCI) {
   // Try to change sum of two reductions.
-  if (SDValue Val = performAddUADDVCombine(N, DAG))
+  if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
+    return Val;
+  if (SDValue Val = performAddDotCombine(N, DCI.DAG))
+    return Val;
+  if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
     return Val;
-  if (SDValue Val = performAddDotCombine(N, DAG))
+  if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
     return Val;
-  if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
+  if (SDValue Val = performVectorAddSubExtCombine(N, DCI.DAG))
     return Val;
-  if (SDValue Val = performNegCSelCombine(N, DAG))
+  if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
     return Val;
-  if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
+  if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
     return Val;
-  if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
+  if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
     return Val;
-  if (SDValue Val = performSubAddMULCombine(N, DAG))
+  if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
     return Val;
-  return performAddSubLongCombine(N, DCI, DAG);
+  return performAddSubLongCombine(N, DCI);
 }
 
 // Massage DAGs which we can use the high-half "long" operations on into
@@ -22312,7 +22348,7 @@
     return performVecReduceBitwiseCombine(N, DCI, DAG);
   case ISD::ADD:
   case ISD::SUB:
-    return performAddSubCombine(N, DCI, DAG);
+    return performAddSubCombine(N, DCI);
   case ISD::BUILD_VECTOR:
     return performBuildVectorCombine(N, DCI, DAG);
   case ISD::TRUNCATE:
diff --git a/llvm/test/CodeGen/AArch64/add-extract.ll b/llvm/test/CodeGen/AArch64/add-extract.ll
--- a/llvm/test/CodeGen/AArch64/add-extract.ll
+++ b/llvm/test/CodeGen/AArch64/add-extract.ll
@@ -4,10 +4,9 @@
 define i64 @add_i64_ext_load(<1 x i64> %A, ptr %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = load i64, ptr %B
@@ -18,10 +17,9 @@
 define i64 @sub_i64_ext_load(<1 x i64> %A, ptr %B) nounwind {
 ; CHECK-LABEL: sub_i64_ext_load:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    sub x0, x9, x8
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = load i64, ptr %B
@@ -32,11 +30,9 @@
 define void @add_i64_ext_load_store(<1 x i64> %A, ptr %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_load_store:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    add x8, x9, x8
-; CHECK-NEXT:    str x8, [x0]
+; CHECK-NEXT:    ldr d1, [x0]
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    str d0, [x0]
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = load i64, ptr %B
@@ -61,11 +57,8 @@
 define i64 @add_i64_ext_ext(<1 x i64> %A, <1 x i64> %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_ext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = extractelement <1 x i64> %B, i32 0
@@ -90,12 +83,10 @@
 define i64 @add_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: add_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    add x9, x9, x10
-; CHECK-NEXT:    add x0, x9, x8
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    dup v1.2d, v1.d[1]
+; CHECK-NEXT:    add d0, d0, d1
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = extractelement <2 x i64> %B, i32 0
@@ -108,12 +99,10 @@
 define i64 @sub_i64_ext_ext_test1(<1 x i64> %A, <2 x i64> %B) nounwind {
 ; CHECK-LABEL: sub_i64_ext_ext_test1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    fmov x9, d0
-; CHECK-NEXT:    fmov x10, d1
-; CHECK-NEXT:    sub x9, x9, x10
-; CHECK-NEXT:    sub x0, x9, x8
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    dup v1.2d, v1.d[1]
+; CHECK-NEXT:    sub d0, d0, d1
+; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    ret
   %a = extractelement <1 x i64> %A, i32 0
   %b = extractelement <2 x i64> %B, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -2733,10 +2733,9 @@
 ; CHECK-LABEL: ursra_scalar:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    urshr d0, d0, #1
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    ursra d1, d0, #1
+; CHECK-NEXT:    fmov x0, d1
 ; CHECK-NEXT:    ret
   %tmp1 = load i64, ptr %A
   %tmp3 = call i64 @llvm.aarch64.neon.urshl.i64(i64 %tmp1, i64 -1)
@@ -2861,10 +2860,9 @@
 ; CHECK-LABEL: srsra_scalar:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    srshr d0, d0, #1
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ldr d1, [x1]
+; CHECK-NEXT:    srsra d1, d0, #1
+; CHECK-NEXT:    fmov x0, d1
 ; CHECK-NEXT:    ret
   %tmp1 = load i64, ptr %A
   %tmp3 = call i64 @llvm.aarch64.neon.srshl.i64(i64 %tmp1, i64 -1)