Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1403,6 +1403,13 @@
   }
   virtual bool optimizeCondBranch(MachineInstr &MI) const { return false; }
 
+  /// Target hook that looks through all instructions of \p MF, finds the ones
+  /// that can be merged together and merges them. The peephole optimizer
+  /// calls it after its own rewrites unless
+  /// -enable-target-side-instrs-merge=false is given.
+  /// \returns true if \p MF was changed. The default implementation changes
+  /// nothing and returns false.
+  virtual bool mergeInstructions(MachineFunction &MF) const { return false; }
+
   /// Try to remove the load by folding it to a register operand at the use.
   /// We fold the load instructions if and only if the
   /// def and use are in the same BB. We only look at one load and see
Index: llvm/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -112,6 +112,10 @@
 DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
                 cl::desc("Disable the peephole optimizer"));
 
+static cl::opt<bool> TargetSideInstructionsMerge(
+    "enable-target-side-instrs-merge", cl::Hidden, cl::init(true),
+    cl::desc("Enable merging instructions on target side"));
+
 /// Specifiy whether or not the value tracking looks through
 /// complex instructions. When this is true, the value tracker
 /// bails on everything that is not a copy or a bitcast.
@@ -1802,6 +1806,10 @@
     }
   }
 
+  if (TargetSideInstructionsMerge) {
+    Changed |= TII->mergeInstructions(MF);
+  }
+
   return Changed;
 }
 
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -209,6 +209,8 @@
                             const MachineRegisterInfo *MRI) const override;
   bool optimizeCondBranch(MachineInstr &MI) const override;
 
+  bool mergeInstructions(MachineFunction &MF) const override;
+
   /// Return true when a code sequence can improve throughput. It
   /// should be called only for instructions in loops.
   /// \param Pattern - combiner pattern
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -54,6 +55,9 @@
 #define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
+#define DEBUG_TYPE "aarch64-instr-info"
+STATISTIC(NumOfMergesTSTAND, "Number of merges of AND and TST instructions");
+
 static cl::opt<unsigned> TBZDisplacementBits(
     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -5325,6 +5329,127 @@
   }
 }
 
+/// Return true if \p And and \p Tst read the same two source operands, in
+/// either order. Sub-register indices are part of the comparison.
+static bool hasSameSourceRegs(const MachineInstr &And,
+                              const MachineInstr &Tst) {
+  const auto SameReg = [](const MachineOperand &A, const MachineOperand &B) {
+    return A.getReg() == B.getReg() && A.getSubReg() == B.getSubReg();
+  };
+  const MachineOperand &AndLHS = And.getOperand(1);
+  const MachineOperand &AndRHS = And.getOperand(2);
+  const MachineOperand &TstLHS = Tst.getOperand(1);
+  const MachineOperand &TstRHS = Tst.getOperand(2);
+
+  return (SameReg(AndLHS, TstLHS) && SameReg(AndRHS, TstRHS)) ||
+         (SameReg(AndLHS, TstRHS) && SameReg(AndRHS, TstLHS));
+}
+
+/// Return true if nothing can observe the register result of \p Tst.
+static bool isResultUnused(const MachineInstr &Tst,
+                           const MachineRegisterInfo &MRI) {
+  const MachineOperand &Def = Tst.getOperand(0);
+  assert(Def.isReg() && "Return value should be stored in register");
+  const Register Reg = Def.getReg();
+
+  // A write to the zero register is discarded; this is the TST alias itself.
+  if (Reg == AArch64::WZR || Reg == AArch64::XZR)
+    return true;
+  // Readers of any other physical register are not tracked here.
+  if (!Reg.isVirtual())
+    return false;
+  // Ask for every use in the function: a virtual register may be read in a
+  // different basic block than the one that defines it.
+  return MRI.use_nodbg_empty(Reg);
+}
+
+/// Fold an AND followed by a TST of the same registers into one ANDS.
+///
+/// TST is an alias of ANDS that writes its result to the zero register. A
+/// pair is merged when all of the following hold:
+///  1. both instructions are in the same basic block, AND first;
+///  2. they read the same two source registers, in either order;
+///  3. no instruction between them reads or writes NZCV, so the flags may be
+///     produced at the position of the AND instead;
+///  4. the register result of the TST has no users.
+/// Only the register-register forms (ANDWrr/ANDXrr and ANDSWrr/ANDSXrr) are
+/// considered.
+///
+/// Example:
+/// \code
+///   and  x3, x2, x1
+///   tst  x2, x1
+/// \endcode
+/// to
+/// \code
+///   ands x3, x2, x1
+/// \endcode
+///
+/// \param MF Machine function to transform.
+/// \return True when at least one merge was generated.
+bool AArch64InstrInfo::mergeInstructions(MachineFunction &MF) const {
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool Changed = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Most recent AND that has not yet been followed by an NZCV access.
+    MachineInstr *PendingAnd = nullptr;
+
+    // Early-increment iteration allows erasing the instruction being visited.
+    for (MachineInstr &MI : make_early_inc_range(MBB)) {
+      const unsigned Opcode = MI.getOpcode();
+      if (Opcode == AArch64::ANDXrr || Opcode == AArch64::ANDWrr) {
+        assert(MI.getNumOperands() == 3 &&
+               "Unexpected number of operands in ANDXrr");
+        PendingAnd = &MI;
+        continue;
+      }
+      if (!PendingAnd)
+        continue;
+
+      // Instructions that leave NZCV alone may sit between the pair.
+      if (!MI.readsRegister(AArch64::NZCV, &TRI) &&
+          !MI.modifiesRegister(AArch64::NZCV, &TRI))
+        continue;
+
+      // MI is the first flag access after the AND. Whether or not it is the
+      // matching TST, the AND cannot pair with anything further down: hoisting
+      // a later flag definition above MI would change what MI reads, or let
+      // MI overwrite the hoisted flags.
+      MachineInstr &And = *PendingAnd;
+      PendingAnd = nullptr;
+
+      if (Opcode != AArch64::ANDSXrr && Opcode != AArch64::ANDSWrr)
+        continue;
+      assert(MI.getNumOperands() == 4 &&
+             "Unexpected number of operands in ANDSXrr");
+      if (!hasSameSourceRegs(And, MI) || !isResultUnused(MI, MRI))
+        continue;
+
+      // Emit the ANDS where the AND was so every user of the AND result is
+      // still dominated by its definition. Copying the operands keeps their
+      // sub-register indices.
+      BuildMI(MBB, And, And.getDebugLoc(), MI.getDesc(),
+              And.getOperand(0).getReg())
+          .add(And.getOperand(1))
+          .add(And.getOperand(2));
+
+      // Debug values that named the TST result must not refer to a register
+      // that no longer has a definition.
+      const Register TstDef = MI.getOperand(0).getReg();
+      if (TstDef.isVirtual())
+        MRI.markUsesInDebugValueAsUndef(TstDef);
+
+      And.eraseFromParent();
+      MI.eraseFromParent();
+      Changed = true;
+      ++NumOfMergesTSTAND;
+    }
+  }
+  return Changed;
+}
+
 std::pair<unsigned, unsigned>
 AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
   const unsigned Mask = AArch64II::MO_FRAGMENT;
Index: llvm/test/CodeGen/AArch64/peephole-and-tst.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mtriple=aarch64-linux-gnu %s -enable-target-side-instrs-merge=true -o %t; FileCheck %s --input-file=%t --check-prefix=CHECKMERGE
+; RUN: llc -mtriple=aarch64-linux-gnu %s -enable-target-side-instrs-merge=false -o %t; FileCheck %s --input-file=%t --check-prefix=CHECKMERGENOT
+
+%struct.anon = type { i32*, i32* }
+
+@ptr_wrapper = common dso_local local_unnamed_addr global %struct.anon* null, align 8
+
+define dso_local i32 @test_func_i32(i32 %in, i32 %bit, i32 %mask) local_unnamed_addr {
+entry:
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i32 %mask, 0
+  br label %do.body
+
+do.body: ; preds = %4, %entry
+; CHECKMERGE-LABEL: test_func_i32:
+; CHECKMERGE-LABEL: .LBB0_2:
+; CHECKMERGE: ands [[DSTREG:w[0-9]+]], [[SRCREG1:w[0-9]+]], [[SRCREG2:w[0-9]+]]
+; CHECKMERGENOT-LABEL: .LBB0_2:
+; CHECKMERGENOT: and [[DSTREG:w[0-9]+]], [[SRCREG1:w[0-9]+]], [[SRCREG2:w[0-9]+]]
+; CHECKMERGENOT-NEXT: tst [[SRCREG1]], [[SRCREG2]]
+  %bit.addr.0 = phi i32 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i32 %bit.addr.0, %in
+  %tobool = icmp eq i32 %and, 0
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  br i1 %2, label %3, label %4
+
+3: ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4: ; preds = %do.body, %3
+  %shl = shl i32 %bit.addr.0, 1
+  %tobool6 = icmp eq i32 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end: ; preds = %4
+  ret i32 %retval1.1
+}
+
+
+define dso_local i32 @test_func_i64(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr #0 {
+entry:
+  %0 = load %struct.anon*, %struct.anon** @ptr_wrapper, align 8
+  %result = getelementptr inbounds %struct.anon, %struct.anon* %0, i64 0, i32 1
+  %tobool2 = icmp ne i64 %mask, 0
+  br label %do.body
+
+do.body: ; preds = %4, %entry
+; CHECKMERGE-LABEL: test_func_i64:
+; CHECKMERGE-LABEL: .LBB1_2:
+; CHECKMERGE: ands [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
+; CHECKMERGENOT-LABEL: .LBB1_2:
+; CHECKMERGENOT: and [[DSTREG:x[0-9]+]], [[SRCREG1:x[0-9]+]], [[SRCREG2:x[0-9]+]]
+; CHECKMERGENOT-NEXT: tst [[SRCREG1]], [[SRCREG2]]
+  %bit.addr.0 = phi i64 [ %bit, %entry ], [ %shl, %4 ]
+  %retval1.0 = phi i32 [ 0, %entry ], [ %retval1.1, %4 ]
+  %and = and i64 %bit.addr.0, %in
+  %tobool = icmp eq i64 %and, 0
+  %not.tobool = xor i1 %tobool, true
+  %inc = zext i1 %not.tobool to i32
+  %retval1.1 = add nuw nsw i32 %retval1.0, %inc
+  %1 = xor i1 %tobool, true
+  %2 = or i1 %tobool2, %1
+  br i1 %2, label %3, label %4
+
+3: ; preds = %do.body
+  store i32* null, i32** %result, align 8
+  br label %4
+
+4: ; preds = %do.body, %3
+  %shl = shl i64 %bit.addr.0, 1
+  %tobool6 = icmp eq i64 %shl, 0
+  br i1 %tobool6, label %do.end, label %do.body
+
+do.end: ; preds = %4
+  ret i32 %retval1.1
+}
\ No newline at end of file