diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1753,6 +1753,25 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
+// isIntImmediate - This method tests to see if the node is a constant
+// operand. If so, Imm will receive the value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                                const APInt &Demanded,
                                TargetLowering::TargetLoweringOpt &TLO,
@@ -16573,6 +16592,32 @@
   return SDValue();
 }
 
+static SDValue performAddCombineForShiftedOperands(SDNode *N,
+                                                   SelectionDAG &DAG) {
+  // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
+  // commutative.
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+
+  // Bail out when the value type is not one of {i32, i64}, since AArch64 ADD
+  // with shifted register is only available for i32 and i64.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  uint64_t LHSImm = 0, RHSImm = 0;
+  // If both operands are shifted by an immediate and the shift amount is not
+  // greater than 4 for one of them, swap LHS and RHS to put the operand with
+  // the smaller shift amount on the RHS.
+  //
+  // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc.), ADD with
+  // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
+  // with LSL shift (shift > 4). For the remaining processors, this is a no-op
+  // for performance or correctness.
+  if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
+      isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
+      RHSImm > 4 && LHS.hasOneUse())
+    return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
+
+  return SDValue();
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
@@ -16587,6 +16632,8 @@
     return Val;
   if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
     return Val;
+  if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
+    return Val;
   return performAddSubLongCombine(N, DCI, DAG);
 }
 
diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -292,8 +292,8 @@
 define i64 @add_swap_rhs_lhs_i64(i64 %0, i64 %1) {
 ; CHECK-LABEL: add_swap_rhs_lhs_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl x8, x1, #3
-; CHECK-NEXT:    add x0, x8, x0, lsl #8
+; CHECK-NEXT:    lsl x8, x0, #8
+; CHECK-NEXT:    add x0, x8, x1, lsl #3
 ; CHECK-NEXT:    ret
   %3 = shl i64 %0, 8
   %4 = shl i64 %1, 3
@@ -318,8 +318,8 @@
 define i32 @add_swap_rhs_lhs_i32(i32 %0, i32 %1) {
 ; CHECK-LABEL: add_swap_rhs_lhs_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsl w8, w1, #3
-; CHECK-NEXT:    add w0, w8, w0, lsl #8
+; CHECK-NEXT:    lsl w8, w0, #8
+; CHECK-NEXT:    add w0, w8, w1, lsl #3
 ; CHECK-NEXT:    ret
   %3 = shl i32 %0, 8
   %4 = shl i32 %1, 3
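For illustration only (not part of the patch): a minimal C++ sketch of the source-level
pattern the new combine targets. The function name add_shifted is hypothetical; the shift
amounts mirror the add_swap_rhs_lhs_i64 test above, where the updated CHECK lines show the
add folding the smaller shift (lsl #3) into its shifted-register operand instead of the
larger one (lsl #8).

    #include <cstdint>

    // Hypothetical example mirroring add_swap_rhs_lhs_i64: one shift is
    // materialized with a separate LSL, the other is folded into the ADD as a
    // shifted-register operand. After the combine, the folded shift is the one
    // that is <= 4, the cheaper form on Cortex-A78 / Neoverse-class cores.
    uint64_t add_shifted(uint64_t a, uint64_t b) {
      return (a << 8) + (b << 3);
    }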