diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -3030,6 +3030,7 @@ [(set regtype:$Rd, (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { + let isSelect = 1; let Uses = [NZCV]; bits<5> Rd; @@ -5210,6 +5211,8 @@ (AArch64csel (vt regtype:$Rn), regtype:$Rm, (i32 imm:$cond), NZCV))]>, Sched<[WriteF]> { + let isSelect = 1; + bits<5> Rd; bits<5> Rn; bits<5> Rm; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -209,6 +209,12 @@ const DebugLoc &DL, Register DstReg, ArrayRef Cond, Register TrueReg, Register FalseReg) const override; + bool analyzeSelect(const MachineInstr &MI, + SmallVectorImpl &Cond, unsigned &TrueOp, + unsigned &FalseOp, bool &Optimizable) const override; + MachineInstr *optimizeSelect(MachineInstr &MI, + SmallPtrSetImpl &NewMIs, + bool PreferFalse = false) const override; MCInst getNop() const override; bool isSchedulingBoundary(const MachineInstr &MI, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -536,6 +536,20 @@ return VReg; } +// Find the original register that VReg is copied from. Return the MI that +// defines it. 
+static const MachineInstr *removeCopiesInstr(const MachineRegisterInfo &MRI,
+                                             const MachineOperand *MO) {
+  while (Register::isVirtualRegister(MO->getReg())) {
+    const MachineInstr *DefMI = MRI.getVRegDef(MO->getReg());
+    if (!DefMI->isFullCopy())
+      return DefMI;
+    MO = &DefMI->getOperand(1);
+  }
+  return MO->getParent();
+}
+
+
 // Determine if VReg is defined by an instruction that can be folded into a
 // csel instruction. If so, return the folded opcode, and the replacement
 // register.
@@ -790,6 +804,186 @@
       .addImm(CC);
 }
 
+/// Walk backwards from the instruction that owns \p MO, within its basic
+/// block only, and return the first instruction found (possibly the owner
+/// itself) that defines the physical register read by \p MO. Returns nullptr
+/// if \p MO is not a physical register operand or no def exists in the block.
+static const MachineInstr *getClosestPhysRegDef(const MachineOperand &MO,
+                                                const TargetRegisterInfo &TRI) {
+  // Test the full register value before converting it to MCPhysReg, so that
+  // the conversion cannot affect the virtual/physical distinction.
+  if (!MO.isReg() || !Register::isPhysicalRegister(MO.getReg()))
+    return nullptr;
+  MCPhysReg Reg = MO.getReg();
+
+  const MachineInstr *MI = MO.getParent();
+  const MachineBasicBlock *MBB = MI->getParent();
+
+  LivePhysRegs PhysRegs(TRI);
+  // Add the register to the set. Step back until it gets removed.
+  PhysRegs.addReg(Reg);
+
+  auto It = MI->getIterator();
+  do {
+    PhysRegs.removeDefs(*It);
+    if (!PhysRegs.contains(Reg)) {
+      // It's been removed. The MI def'd the reg.
+      return &*It;
+    }
+    if (It != MBB->begin())
+      --It;
+  } while (It != MBB->begin());
+
+  // The loop exits on reaching the first instruction of the block without
+  // having examined it (unless MI itself is first), so examine it here.
+  assert(It == MBB->begin());
+  PhysRegs.removeDefs(*It);
+  if (!PhysRegs.contains(Reg))
+    return &*It;
+  return nullptr;
+}
+
+/// Return the instruction that produces the value read by \p MO, or nullptr
+/// if none is found. Physical registers are handled by getClosestPhysRegDef.
+/// Virtual registers are traced through full copies; if that chain ends in a
+/// copy from a physical register, the closest def of that physical register
+/// is looked up instead.
+static const MachineInstr *getClosestRegDef(const MachineOperand &MO,
+                                            const MachineRegisterInfo &MRI,
+                                            const TargetRegisterInfo &TRI) {
+  if (!MO.isReg())
+    return nullptr;
+  if (Register::isPhysicalRegister(MO.getReg()))
+    return getClosestPhysRegDef(MO, TRI);
+
+  // For VRegs, first walk through all the copies. If we find a copy from a
+  // physreg, try to find the closest def for that reg.
+  const MachineInstr *DefMI = removeCopiesInstr(MRI, &MO);
+  if (!DefMI->isFullCopy())
+    return DefMI;
+
+  const MachineOperand &SrcOp = DefMI->getOperand(1);
+  assert(SrcOp.isReg() && Register::isPhysicalRegister(SrcOp.getReg()) &&
+         "copy chain must end at a physical register");
+  return getClosestPhysRegDef(SrcOp, TRI);
+}
+
+/// Returns true if a (F)CSEL* can be folded into a COPY, i.e. if both of its
+/// source operands are known to hold the same value. \p TII is unused.
+static bool canFoldIntoCOPY(const MachineInstr &MI,
+                            const TargetInstrInfo &TII) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::FCSELHrrr:
+  case AArch64::FCSELSrrr:
+  case AArch64::FCSELDrrr:
+  case AArch64::CSELWr:
+  case AArch64::CSELXr:
+    break;
+  }
+
+  const MachineOperand &TrueOperand = MI.getOperand(1);
+  const MachineOperand &FalseOperand = MI.getOperand(2);
+  // Only check for regs.
+  if (!TrueOperand.isReg() || !FalseOperand.isReg())
+    return false;
+
+  // If it's exactly the same register, do it.
+  if (TrueOperand.getReg() == FalseOperand.getReg())
+    return true;
+
+  const MachineBasicBlock &MBB = *MI.getParent();
+  const MachineFunction &MF = *MBB.getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+  // Look through full copies. Both operands must trace back to the very same
+  // register; comparing only the defining instructions is not enough, since a
+  // single instruction may define several different registers.
+  Register TrueReg = removeCopies(MRI, TrueOperand.getReg());
+  Register FalseReg = removeCopies(MRI, FalseOperand.getReg());
+  if (TrueReg != FalseReg)
+    return false;
+
+  // Here we try to go as deep as possible to find a definition.
+  const MachineInstr *TrueDef = getClosestRegDef(TrueOperand, MRI, TRI);
+  const MachineInstr *FalseDef = getClosestRegDef(FalseOperand, MRI, TRI);
+
+  // If there is no definition in the searched blocks, we are looking at the
+  // same physical register, so check the MBB's liveins.
+  // NOTE(review): the copies we looked through may sit in a different block
+  // than MI; confirm that a live-in check on MI's block is sufficient.
+  if (!TrueDef && !FalseDef)
+    return TrueReg.isPhysical() && MBB.isLiveIn(TrueReg);
+
+  // Both values must be produced by the same instruction, and that
+  // instruction must yield a register result that is not an IMPLICIT_DEF.
+  if (!TrueDef || TrueDef != FalseDef)
+    return false;
+  if (TrueDef->getNumOperands() == 0 || !TrueDef->getOperand(0).isReg())
+    return false;
+  return !TrueDef->isImplicitDef();
+}
+
+/// Report whether the select \p MI can be optimized. Returns false (success)
+/// and sets \p Optimizable only when both sources of \p MI are known to hold
+/// the same value; returns true (failure) otherwise. On success \p TrueOp and
+/// \p FalseOp are the operand indices of the two sources and \p Cond receives
+/// the remaining operands (see the operand layout listed in the body).
+bool AArch64InstrInfo::analyzeSelect(const MachineInstr &MI,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     unsigned &TrueOp, unsigned &FalseOp,
+                                     bool &Optimizable) const {
+  assert(MI.isSelect());
+  if (!canFoldIntoCOPY(MI, *this))
+    return true;
+  // CSEL operands:
+  // 0: Def.
+  // 1: True use.
+  // 2: False use.
+  // 3: Condition code.
+  // 4: NZCV implicit use.
+  TrueOp = 1;
+  FalseOp = 2;
+  // Add the rest of the operands as part of the condition. Start after the
+  // false operand so that it does not end up in Cond.
+  for (unsigned OpIdx = FalseOp + 1; OpIdx < MI.getNumOperands(); ++OpIdx)
+    Cond.push_back(MI.getOperand(OpIdx));
+  Optimizable = true;
+  return false;
+}
+
+/// Replace the select \p MI with a COPY of one of its sources into its
+/// destination register, inserted right before \p MI. \p MI itself is not
+/// erased here; it is only dropped from \p NewMIs, to which the COPY is added.
+/// Intended for selects that analyzeSelect reported as Optimizable.
+MachineInstr *
+AArch64InstrInfo::optimizeSelect(MachineInstr &MI,
+                                 SmallPtrSetImpl<MachineInstr *> &NewMIs,
+                                 bool PreferFalse) const {
+  assert(MI.isSelect());
+  assert((MI.getOpcode() == AArch64::CSELWr ||
+          MI.getOpcode() == AArch64::CSELXr ||
+          MI.getOpcode() == AArch64::FCSELHrrr ||
+          MI.getOpcode() == AArch64::FCSELSrrr ||
+          MI.getOpcode() == AArch64::FCSELDrrr) &&
+         "Unknown select instruction");
+  // analyzeSelect only accepts selects whose two sources hold the same value.
+  // NOTE(review): PreferFalse picks operand 1, which analyzeSelect reports as
+  // the true operand; confirm this is the intended polarity.
+  const unsigned SrcIdx = PreferFalse ? 1 : 2;
+  MachineInstrBuilder NewMI =
+      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
+              MI.getOperand(0).getReg())
+          .addReg(MI.getOperand(SrcIdx).getReg());
+
+  NewMIs.erase(&MI);
+  NewMIs.insert(NewMI);
+
+  return NewMI;
+}
+
 /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx.
 static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) {
   uint64_t Imm = MI.getOperand(1).getImm();
diff --git a/llvm/test/CodeGen/AArch64/csel-same-source.mir b/llvm/test/CodeGen/AArch64/csel-same-source.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/csel-same-source.mir
@@ -0,0 +1,114 @@
+#RUN: llc -mtriple=aarch64-- -run-pass peephole-opt %s -o - | FileCheck %s
+
+---
+name: csel
+# CHECK-LABEL: name: csel
+# A coalescing pass is done from machine-cse, and it results in a CSEL with the
+# same true and false source. Make sure it gets eliminated.
+# NOTE(review): the CSE/CSE-NOT lines below use a prefix that the RUN line
+# does not enable, so FileCheck does not evaluate them.
+tracksRegLiveness: true
+body: |
+  bb.0:
+    %0:gpr32 = IMPLICIT_DEF
+    $nzcv = IMPLICIT_DEF
+    %1:gpr32 = UBFMWri %0, 13, 31
+    %2:gpr32 = COPY %1
+    %3:gpr32 = UBFMWri %0, 13, 31
+    %4:gpr32 = COPY %3
+    %5:gpr32 = CSELWr %2, %4, 12, implicit $nzcv
+    ; CSE: = COPY
+    ; CSE-NOT: = CSELWr
+    RET_ReallyLR
+
+...
+---
+name: cselcopy
+# CHECK-LABEL: name: cselcopy
+# Make sure we eliminate CSELS through copies.
+tracksRegLiveness: true +body: | + bb.0: + %0:gpr32 = IMPLICIT_DEF + $nzcv = IMPLICIT_DEF + %1:gpr32 = COPY %0 + %2:gpr32 = COPY %0 + %3:gpr32 = COPY %1 + %4:gpr32 = COPY %2 + %5:gpr32 = CSELWr %3, %4, 12, implicit $nzcv + ; CHECK: = COPY + ; CHECK-NOT: = CSELWr + RET_ReallyLR + +... +--- +# CHECK-LABEL: name: onedef +# Check that we verify that the physreg has only one def. If we don't check +# for that, the transformation is incorrect since a def in the middle could +# invalidate the chain. +name: onedef +body: | + bb.0: + $x0 = IMPLICIT_DEF + %0:gpr64 = COPY $x0 + $x0 = IMPLICIT_DEF + %1:gpr64 = COPY $x0 + %2:gpr64 = CSELXr %0, %1, 0, implicit $nzcv + ; CHECK: CSELXr %0 + ; CHECK-NOT: COPY %0 + RET_ReallyLR + +... +--- +# CHECK-LABEL: name: onedef2 +name: onedef2 +body: | + bb.0: + liveins: $x0 + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY $x0 + %2:gpr64 = CSELXr %0, %1, 0, implicit $nzcv + ; CHECK: COPY %1 + ; CHECK-NOT: CSELXr %0 + RET_ReallyLR + +... +--- +# CHECK-LABEL: name: onedef3 +name: onedef3 +body: | + bb.0: + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY $x1 + %2:gpr64 = CSELXr %0, %1, 0, implicit $nzcv + RET_ReallyLR + +... +--- +# CHECK-LABEL: name: implicitdef +name: implicitdef +body: | + bb.0: + %0:gpr64 = IMPLICIT_DEF implicit-def $x0, implicit-def $x1 + %1:gpr64 = COPY $x0 + %2:gpr64 = COPY $x1 + %3:gpr64 = CSELXr %1, %2, 0, implicit $nzcv + ; CHECK: CSELXr %1 + ; CHECK-NOT: COPY %1 + RET_ReallyLR + +... 
+--- +# CHECK-LABEL: name: implicitdef2 +name: implicitdef2 +body: | + bb.0: + $x0 = IMPLICIT_DEF + %0:gpr64 = COPY $x0 + %1:gpr64 = IMPLICIT_DEF implicit-def $x0 + %2:gpr64 = COPY $x0 + %3:gpr64 = CSELXr %0, %2, 0, implicit $nzcv + ; CHECK: CSELXr %0 + ; CHECK-NOT: COPY %0 + RET_ReallyLR + diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -269,9 +269,7 @@ define i32 @test_srem_allones(i32 %X) nounwind { ; CHECK-LABEL: test_srem_allones: ; CHECK: // %bb.0: -; CHECK-NEXT: cmp w0, #0 // =0 -; CHECK-NEXT: csel w8, w0, w0, lt -; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cmp w0, w0 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %srem = srem i32 %X, 4294967295