Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -88,8 +88,9 @@ struct ExtAddrMode { Register BaseReg; Register ScaledReg; - int64_t Scale; - int64_t Displacement; + int64_t Scale = 0; + int64_t Displacement = 0; + ExtAddrMode() = default; }; //--------------------------------------------------------------------------- @@ -1433,6 +1434,26 @@ return std::nullopt; } + /// Check if it's possible and beneficial to fold the addressing computation + /// `AddrI` into the addressing mode of the load/store instruction `MemI`. The + /// memory instruction is a user of the virtual register `Reg`, which in turn + /// is the ultimate destination of zero or more COPY instructions from the + /// output register of `AddrI`. + /// Return the addressing mode after folding in `AM`. + virtual bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + return false; + } + + /// Emit a load/store instruction with the same value register as `MemI`, but + /// using the address from `AM`. The addressing mode must have been obtained + /// from `canFoldIntoAddrMode` for the same memory instruction. + virtual MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + llvm_unreachable("target did not implement emitLdStWithAddr()"); + } + /// Returns true if MI's Def is NullValueReg, and the MI /// does not change the Zero value. i.e. cases such as rax = shr rax, X where /// NullValueReg = rax. Note that if the NullValueReg is non-zero, this Index: llvm/include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- llvm/include/llvm/CodeGen/TargetPassConfig.h +++ llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -130,6 +130,11 @@ /// Default setting for -enable-tail-merge on this target. 
bool EnableTailMerge = true; + /// Enable sinking of instructions in MachineSink where a computation can be + /// folded into the addressing mode of a memory load/store instruction or + /// replace a copy. + bool EnableSinkAndFold = false; + /// Require processing of functions such that callees are generated before /// callers. bool RequireCodeGenSCCOrder = false; @@ -176,6 +181,9 @@ bool getEnableTailMerge() const { return EnableTailMerge; } void setEnableTailMerge(bool Enable) { setOpt(EnableTailMerge, Enable); } + bool getEnableSinkAndFold() const { return EnableSinkAndFold; } + void setEnableSinkAndFold(bool Enable) { setOpt(EnableSinkAndFold, Enable); } + bool requiresCodeGenSCCOrder() const { return RequireCodeGenSCCOrder; } void setRequiresCodeGenSCCOrder(bool Enable = true) { setOpt(RequireCodeGenSCCOrder, Enable); Index: llvm/lib/CodeGen/MachineSink.cpp =================================================================== --- llvm/lib/CodeGen/MachineSink.cpp +++ llvm/lib/CodeGen/MachineSink.cpp @@ -41,6 +41,7 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/BasicBlock.h" @@ -115,6 +116,7 @@ namespace { class MachineSinking : public MachineFunctionPass { + const TargetSubtargetInfo *STI = nullptr; const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; MachineRegisterInfo *MRI = nullptr; // Machine register information @@ -165,7 +167,9 @@ StoreInstrCache; /// Cached BB's register pressure. 
- std::map> CachedRegisterPressure; + std::map> CachedRegisterPressure; + + bool EnableSinkAndFold; public: static char ID; // Pass identification @@ -187,6 +191,7 @@ AU.addPreserved(); if (UseBlockFreqInfo) AU.addRequired(); + AU.addRequired(); } void releaseMemory() override { @@ -246,11 +251,16 @@ bool PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB); + bool PerformSinkAndFold(MachineInstr &MI, MachineBasicBlock *MBB); + SmallVector & GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, AllSuccsCache &AllSuccessors) const; - std::vector &getBBRegisterPressure(MachineBasicBlock &MBB); + std::vector &getBBRegisterPressure(const MachineBasicBlock &MBB); + + bool registerPressureSetExceedsLimit(unsigned NRegs, const TargetRegisterClass *RC, + const MachineBasicBlock &MBB); }; } // end anonymous namespace @@ -300,6 +310,179 @@ return true; } +bool MachineSinking::PerformSinkAndFold(MachineInstr &MI, MachineBasicBlock *MBB) { + if (MI.isCopyLike() || MI.mayLoadOrStore() || + MI.getOpcode() == TargetOpcode::REG_SEQUENCE) + return false; + + // Don't sink instructions that the target prefers not to sink. + if (!TII->shouldSink(MI)) + return false; + + // Check if it's safe to move the instruction. + bool SawStore = true; + if (!MI.isSafeToMove(AA, SawStore)) + return false; + + // Convergent operations may not be made control-dependent on additional + // values. + if (MI.isConvergent()) + return false; + + // Don't sink defs/uses of hard registers or if the instruction defines more + // than one register. + // Don't sink more than two register uses - it'll cover most of the cases and + // greatly simplifies the register pressure checks. 
+ Register DefReg; + Register UsedRegA, UsedRegB; + for (const MachineOperand &MO : MI.operands()) { + if (MO.isImm() || MO.isRegMask() || MO.isRegLiveOut() || MO.isMetadata() || + MO.isMCSymbol() || MO.isDbgInstrRef() || MO.isCFIIndex() || + MO.isIntrinsicID() || MO.isPredicate() || MO.isShuffleMask()) + continue; + if (!MO.isReg()) + return false; + + Register Reg = MO.getReg(); + if (Reg == 0) + continue; + + if (Reg.isVirtual()) { + if (MO.isDef()) { + if (DefReg) + return false; + DefReg = Reg; + continue; + } + + if (UsedRegA == 0) + UsedRegA = Reg; + else if (UsedRegB == 0) + UsedRegB = Reg; + else + return false; + continue; + } + + if (Reg.isPhysical() && + (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO))) + continue; + + return false; + } + + // Scan uses of the destination register. Every use, except the last, must be + // a copy, with a chain of copies terminating with either a copy into a hard + // register, or a load/store instruction where the use is part of the + // address (*not* the stored value). + using SinkInfo = std::pair; + SmallVector SinkInto; + SmallVector Worklist; + Worklist.push_back(DefReg); + + const TargetRegisterClass *RC = MRI->getRegClass(DefReg); + const TargetRegisterClass *RCA = + UsedRegA == 0 ? nullptr : MRI->getRegClass(UsedRegA); + const TargetRegisterClass *RCB = + UsedRegB == 0 ? nullptr : MRI->getRegClass(UsedRegB); + + while (!Worklist.empty()) { + Register Reg = Worklist.pop_back_val(); + + for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) { + ExtAddrMode MaybeAM; + MachineInstr &UseInst = *MO.getParent(); + if (UseInst.isCopy()) { + Register DstReg; + if (const MachineOperand &O = UseInst.getOperand(0); O.isReg()) + DstReg = O.getReg(); + if (DstReg == 0) + return false; + if (DstReg.isVirtual()) { + Worklist.push_back(DstReg); + continue; + } + // If we are going to replace a copy, the original instruction must be + // as cheap as a copy. 
+ if (!TII->isAsCheapAsAMove(MI)) + return false; + // The hard register must be in the register class of the original + // instruction's destination register. + if (!RC->contains(DstReg)) + return false; + } else if (UseInst.mayLoadOrStore()) { + ExtAddrMode AM; + if (!TII->canFoldIntoAddrMode(UseInst, Reg, MI, AM)) + return false; + MaybeAM = AM; + } else { + return false; + } + + if (UseInst.getParent() == MI.getParent()) + return false; + + // If the register class of the register we are replacing is a superset + // of any of the register classes of the operands of the materialized + // instruction don't consider that live range extended. + const TargetRegisterClass *RCS = MRI->getRegClass(Reg); + if (RCA && RCA->hasSuperClassEq(RCS)) + RCA = nullptr; + else if (RCB && RCB->hasSuperClassEq(RCS)) + RCB = nullptr; + if (RCA || RCB) { + if (RCA == nullptr) { + RCA = RCB; + RCB = nullptr; + } + + unsigned NRegs = !!RCA + !!RCB; + if (RCA == RCB) + RCB = nullptr; + + // Check we don't exceed register pressure at the destination. + const MachineBasicBlock &MBB = *UseInst.getParent(); + if (RCB == nullptr) { + if (registerPressureSetExceedsLimit(NRegs, RCA, MBB)) + return false; + } else if (registerPressureSetExceedsLimit(1, RCA, MBB) || + registerPressureSetExceedsLimit(1, RCB, MBB)) { + return false; + } + } + + SinkInto.emplace_back(&UseInst, MaybeAM); + } + } + + if (SinkInto.empty()) + return false; + + // Now we know we can fold the instruction in all its users. 
+ if (UsedRegA) + MRI->clearKillFlags(UsedRegA); + if (UsedRegB) + MRI->clearKillFlags(UsedRegB); + + for (auto &[SinkDst, MaybeAM] : SinkInto) { + LLVM_DEBUG(dbgs() << "Sinking copy of"; MI.dump(); dbgs() << "into"; + SinkDst->dump();); + if (SinkDst->isCopy()) { + MachineBasicBlock::iterator InsertPt = SinkDst->getIterator(); + TII->reMaterialize(*SinkDst->getParent(), InsertPt, + SinkDst->getOperand(0).getReg(), 0, MI, *TRI); + LLVM_DEBUG(dbgs() << "yielding"; std::prev(InsertPt)->dump();); + } else { + MachineInstr *NewInstr = TII->emitLdStWithAddr(*SinkDst, MaybeAM); + LLVM_DEBUG(dbgs() << "yielding"; NewInstr->dump();); + } + SinkDst->eraseFromParent(); + } + + MI.eraseFromParent(); + return true; +} + /// AllUsesDominatedByBlock - Return true if all uses of the specified register /// occur in blocks dominated by the specified block. If any use is in the /// definition block, then return false since it is never legal to move def @@ -423,8 +606,9 @@ LLVM_DEBUG(dbgs() << "******** Machine Sinking ********\n"); - TII = MF.getSubtarget().getInstrInfo(); - TRI = MF.getSubtarget().getRegisterInfo(); + STI = &MF.getSubtarget(); + TII = STI->getInstrInfo(); + TRI = STI->getRegisterInfo(); MRI = &MF.getRegInfo(); DT = &getAnalysis(); PDT = &getAnalysis(); @@ -433,6 +617,8 @@ MBPI = &getAnalysis(); AA = &getAnalysis().getAAResults(); RegClassInfo.runOnMachineFunction(MF); + TargetPassConfig *PassConfig = &getAnalysis(); + EnableSinkAndFold = PassConfig->getEnableSinkAndFold(); bool EverMadeChange = false; @@ -509,8 +695,8 @@ } bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) { - // Can't sink anything out of a block that has less than two successors. - if (MBB.succ_size() <= 1 || MBB.empty()) return false; + if ((!EnableSinkAndFold && MBB.succ_size() <= 1) || MBB.empty()) + return false; // Don't bother sinking code out of unreachable blocks. 
In addition to being // unprofitable, it can also lead to infinite looping, because in an @@ -541,8 +727,27 @@ continue; } - bool Joined = PerformTrivialForwardCoalescing(MI, &MBB); - if (Joined) { + if (EnableSinkAndFold) { + if (MI.isCopy()) { + Register Reg = MI.getOperand(0).getReg(); + if (Reg.isVirtual() && MRI->use_empty(Reg)) { + MI.eraseFromParent(); + MadeChange = true; + continue; + } + } + + if (PerformSinkAndFold(MI, &MBB)) { + MadeChange = true; + continue; + } + } + + // Can't sink anything out of a block that has less than two successors. + if (MBB.succ_size() <= 1) + continue; + + if (PerformTrivialForwardCoalescing(MI, &MBB)) { MadeChange = true; continue; } @@ -559,7 +764,6 @@ SeenDbgVars.clear(); // recalculate the bb register pressure after sinking one BB. CachedRegisterPressure.clear(); - return MadeChange; } @@ -699,7 +903,7 @@ } std::vector & -MachineSinking::getBBRegisterPressure(MachineBasicBlock &MBB) { +MachineSinking::getBBRegisterPressure(const MachineBasicBlock &MBB) { // Currently to save compiling time, MBB's register pressure will not change // in one ProcessBlock iteration because of CachedRegisterPressure. but MBB's // register pressure is changed after sinking any instructions into it. 
@@ -715,10 +919,10 @@ RPTracker.init(MBB.getParent(), &RegClassInfo, nullptr, &MBB, MBB.end(), /*TrackLaneMasks*/ false, /*TrackUntiedDefs=*/true); - for (MachineBasicBlock::iterator MII = MBB.instr_end(), + for (MachineBasicBlock::const_iterator MII = MBB.instr_end(), MIE = MBB.instr_begin(); MII != MIE; --MII) { - MachineInstr &MI = *std::prev(MII); + const MachineInstr &MI = *std::prev(MII); if (MI.isDebugInstr() || MI.isPseudoProbe()) continue; RegisterOperands RegOpers; @@ -734,6 +938,18 @@ return It.first->second; } +bool MachineSinking::registerPressureSetExceedsLimit( + unsigned NRegs, const TargetRegisterClass *RC, const MachineBasicBlock &MBB) { + unsigned Weight = NRegs * TRI->getRegClassWeight(RC).RegWeight; + const int *PS = TRI->getRegClassPressureSets(RC); + const auto &BBRegisterPressure = getBBRegisterPressure(MBB); + for (; *PS != -1; PS++) + if (Weight + BBRegisterPressure[*PS] >= + TRI->getRegPressureSetLimit(*MBB.getParent(), *PS)) + return true; + return false; +} + /// isProfitableToSinkTo - Return true if it is profitable to sink MI. bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI, MachineBasicBlock *MBB, @@ -778,21 +994,6 @@ if (!MCycle) return false; - auto isRegisterPressureSetExceedLimit = [&](const TargetRegisterClass *RC) { - unsigned Weight = TRI->getRegClassWeight(RC).RegWeight; - const int *PS = TRI->getRegClassPressureSets(RC); - // Get register pressure for block SuccToSinkTo. - std::vector BBRegisterPressure = - getBBRegisterPressure(*SuccToSinkTo); - for (; *PS != -1; PS++) - // check if any register pressure set exceeds limit in block SuccToSinkTo - // after sinking. - if (Weight + BBRegisterPressure[*PS] >= - TRI->getRegPressureSetLimit(*MBB->getParent(), *PS)) - return true; - return false; - }; - // If this instruction is inside a Cycle and sinking this instruction can make // more registers live range shorten, it is still prifitable. 
for (const MachineOperand &MO : MI.operands()) { @@ -832,7 +1033,7 @@ // The DefMI is defined inside the cycle. // If sinking this operand makes some register pressure set exceed limit, // it is not profitable. - if (isRegisterPressureSetExceedLimit(MRI->getRegClass(Reg))) { + if (registerPressureSetExceedsLimit(1, MRI->getRegClass(Reg), *SuccToSinkTo)) { LLVM_DEBUG(dbgs() << "register pressure exceed limit, not profitable."); return false; } Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15325,25 +15325,7 @@ NumBytes = 0; } - if (!AM.Scale) { - int64_t Offset = AM.BaseOffs; - - // 9-bit signed offset - if (isInt<9>(Offset)) - return true; - - // 12-bit unsigned offset - unsigned shift = Log2_64(NumBytes); - if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && - // Must be a multiple of NumBytes (NumBytes is a power of 2) - (Offset >> shift) << shift == Offset) - return true; - return false; - } - - // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - - return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); + return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs, AM.Scale); } bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -140,6 +140,13 @@ getAddrModeFromMemoryOp(const MachineInstr &MemI, const TargetRegisterInfo *TRI) const override; + bool canFoldIntoAddrMode(const MachineInstr &MemI, Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const override; + + MachineInstr *emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const override; + bool 
getMemOperandsWithOffsetWidth( const MachineInstr &MI, SmallVectorImpl &BaseOps, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, @@ -340,6 +347,11 @@ static void decomposeStackOffsetForDwarfOffsets(const StackOffset &Offset, int64_t &ByteSized, int64_t &VGSized); + + // Return true if address of the form BaseReg + Scale * ScaledReg + Offset can + // be used for a load/store of NumBytes. BaseReg is always present and implicit. + bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2643,6 +2643,438 @@ return AM; } +bool AArch64InstrInfo::canFoldIntoAddrMode(const MachineInstr &MemI, + Register Reg, + const MachineInstr &AddrI, + ExtAddrMode &AM) const { + // Support folding into addressing mode of the form Reg + Offset. 
+ unsigned NumBytes; + int64_t OffsetScale = 1; + switch (MemI.getOpcode()) { + default: + return false; + + case AArch64::LDURQi: + case AArch64::STURQi: + NumBytes = 16; + break; + + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDURXi: + case AArch64::STURXi: + NumBytes = 8; + break; + + case AArch64::LDURWi: + case AArch64::LDURSWi: + case AArch64::STURWi: + NumBytes = 4; + break; + + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDURHHi: + case AArch64::STURHHi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + NumBytes = 2; + break; + + case AArch64::LDURBi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::STURBi: + case AArch64::STURBBi: + + case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::LDRSBXui: + case AArch64::LDRSBWui: + case AArch64::STRBui: + case AArch64::STRBBui: + NumBytes = 1; + break; + + case AArch64::LDRQui: + case AArch64::STRQui: + NumBytes = 16; + OffsetScale = 16; + break; + + case AArch64::LDRDui: + case AArch64::STRDui: + case AArch64::LDRXui: + case AArch64::STRXui: + NumBytes = 8; + OffsetScale = 8; + break; + + case AArch64::LDRWui: + case AArch64::LDRSWui: + case AArch64::STRWui: + NumBytes = 4; + OffsetScale = 4; + break; + + case AArch64::LDRHui: + case AArch64::STRHui: + case AArch64::LDRHHui: + case AArch64::STRHHui: + case AArch64::LDRSHXui: + case AArch64::LDRSHWui: + NumBytes = 2; + OffsetScale = 2; + break; + } + +// Check the fold operand is not the loaded/stored value. 
+ const MachineOperand &BaseRegOp = MemI.getOperand(0); + if (BaseRegOp.isReg() && BaseRegOp.getReg() == Reg) + return false; + + auto canFoldAddSubImmIntoAddrMode = [&](int64_t Offset) -> bool { + Offset += MemI.getOperand(2).getImm() * OffsetScale; + if (!isLegalAddressingMode(NumBytes, Offset, /* Scale */ 0)) + return false; + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = 0; + AM.Scale = 0; + AM.Displacement = Offset; + return true; + }; + + auto canFoldAddRegIntoAddrMode = [&](int64_t Scale) -> bool { + if (MemI.getOperand(2).getImm() != 0) + return false; + if (!isLegalAddressingMode(NumBytes, /* Offset */ 0, Scale)) + return false; + AM.BaseReg = AddrI.getOperand(1).getReg(); + AM.ScaledReg = AddrI.getOperand(2).getReg(); + AM.Scale = Scale; + AM.Displacement = 0; + return true; + }; + + int64_t Offset = 0; + switch (AddrI.getOpcode()) { + default: + return false; + + case AArch64::ADDXri: + Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); + return canFoldAddSubImmIntoAddrMode(Offset); + + case AArch64::SUBXri: + Offset = AddrI.getOperand(2).getImm() << AddrI.getOperand(3).getImm(); + return canFoldAddSubImmIntoAddrMode(-Offset); + + case AArch64::ADDXrs: { + int64_t Shift = AddrI.getOperand(3).getImm(); + // Don't allow LSR, ASR, or LSL with shifts other than 0, 1, 2, 3, and 4 + if (Shift > 4) + return false; + // Shift 1 (scale 2) in address is one extra cycle and one extra unit on + // some CPUs. 
+ if (Shift == 1 && !Subtarget.hasLSLFast()) + return false; + return canFoldAddRegIntoAddrMode(1 << Shift); + } + case AArch64::ADDXrr: + return canFoldAddRegIntoAddrMode(1); + } +} + +static unsigned regOffsetOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Address folding not implemented for instruction"); + + case AArch64::LDURQi: + case AArch64::LDRQui: + return AArch64::LDRQroX; + case AArch64::STURQi: + case AArch64::STRQui: + return AArch64::STRQroX; + case AArch64::LDURDi: + case AArch64::LDRDui: + return AArch64::LDRDroX; + case AArch64::STURDi: + case AArch64::STRDui: + return AArch64::STRDroX; + case AArch64::LDURXi: + case AArch64::LDRXui: + return AArch64::LDRXroX; + case AArch64::STURXi: + case AArch64::STRXui: + return AArch64::STRXroX; + case AArch64::LDURWi: + case AArch64::LDRWui: + return AArch64::LDRWroX; + case AArch64::LDURSWi: + case AArch64::LDRSWui: + return AArch64::LDRSWroX; + case AArch64::STURWi: + case AArch64::STRWui: + return AArch64::STRWroX; + case AArch64::LDURHi: + case AArch64::LDRHui: + return AArch64::LDRHroX; + case AArch64::STURHi: + case AArch64::STRHui: + return AArch64::STRHroX; + case AArch64::LDURHHi: + case AArch64::LDRHHui: + return AArch64::LDRHHroX; + case AArch64::STURHHi: + case AArch64::STRHHui: + return AArch64::STRHHroX; + case AArch64::LDURSHXi: + case AArch64::LDRSHXui: + return AArch64::LDRSHXroX; + case AArch64::LDURSHWi: + case AArch64::LDRSHWui: + return AArch64::LDRSHWroX; + case AArch64::LDURBi: + case AArch64::LDRBui: + return AArch64::LDRBroX; + case AArch64::LDURBBi: + case AArch64::LDRBBui: + return AArch64::LDRBBroX; + case AArch64::LDURSBXi: + case AArch64::LDRSBXui: + return AArch64::LDRSBXroX; + case AArch64::LDURSBWi: + case AArch64::LDRSBWui: + return AArch64::LDRSBWroX; + case AArch64::STURBi: + case AArch64::STRBui: + return AArch64::STRBroX; + case AArch64::STURBBi: + case AArch64::STRBBui: + return AArch64::STRBBroX; + } +} + +unsigned 
scaledOffsetOpcode(unsigned Opcode, unsigned &Scale) { + switch (Opcode) { + default: + llvm_unreachable("Address folding not implemented for instruction"); + + case AArch64::LDURQi: + Scale = 16; + return AArch64::LDRQui; + case AArch64::STURQi: + Scale = 16; + return AArch64::STRQui; + case AArch64::LDURDi: + Scale = 8; + return AArch64::LDRDui; + case AArch64::STURDi: + Scale = 8; + return AArch64::STRDui; + case AArch64::LDURXi: + Scale = 8; + return AArch64::LDRXui; + case AArch64::STURXi: + Scale = 8; + return AArch64::STRXui; + case AArch64::LDURWi: + Scale = 4; + return AArch64::LDRWui; + case AArch64::LDURSWi: + Scale = 4; + return AArch64::LDRSWui; + case AArch64::STURWi: + Scale = 4; + return AArch64::STRWui; + case AArch64::LDURHi: + Scale = 2; + return AArch64::LDRHui; + case AArch64::STURHi: + Scale = 2; + return AArch64::STRHui; + case AArch64::LDURHHi: + Scale = 2; + return AArch64::LDRHHui; + case AArch64::STURHHi: + Scale = 2; + return AArch64::STRHHui; + case AArch64::LDURSHXi: + Scale = 2; + return AArch64::LDRSHXui; + case AArch64::LDURSHWi: + Scale = 2; + return AArch64::LDRSHWui; + case AArch64::LDURBi: + Scale = 1; + return AArch64::LDRBui; + case AArch64::LDURBBi: + Scale = 1; + return AArch64::LDRBBui; + case AArch64::LDURSBXi: + Scale = 1; + return AArch64::LDRSBXui; + case AArch64::LDURSBWi: + Scale = 1; + return AArch64::LDRSBWui; + case AArch64::STURBi: + Scale = 1; + return AArch64::STRBui; + case AArch64::STURBBi: + Scale = 1; + return AArch64::STRBBui; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = 16; + return Opcode; + case AArch64::LDRDui: + case AArch64::STRDui: + case AArch64::LDRXui: + case AArch64::STRXui: + Scale = 8; + return Opcode; + case AArch64::LDRWui: + case AArch64::LDRSWui: + case AArch64::STRWui: + Scale = 4; + return Opcode; + case AArch64::LDRHui: + case AArch64::STRHui: + case AArch64::LDRHHui: + case AArch64::STRHHui: + case AArch64::LDRSHXui: + case AArch64::LDRSHWui: + Scale = 2; + return Opcode; + 
case AArch64::LDRBui: + case AArch64::LDRBBui: + case AArch64::LDRSBXui: + case AArch64::LDRSBWui: + case AArch64::STRBui: + case AArch64::STRBBui: + Scale = 1; + return Opcode; + } +} + +unsigned unscaledOffsetOpcode(unsigned Opcode) { + switch (Opcode) { + default: + llvm_unreachable("Address folding not implemented for instruction"); + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDURXi: + case AArch64::STURXi: + case AArch64::LDURWi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDURHHi: + case AArch64::STURHHi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURBi: + case AArch64::STURBi: + case AArch64::LDURBBi: + case AArch64::STURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + return Opcode; + case AArch64::LDRQui: + return AArch64::LDURQi; + case AArch64::STRQui: + return AArch64::STURQi; + case AArch64::LDRDui: + return AArch64::LDURDi; + case AArch64::STRDui: + return AArch64::STURDi; + case AArch64::LDRXui: + return AArch64::LDURXi; + case AArch64::STRXui: + return AArch64::STURXi; + case AArch64::LDRWui: + return AArch64::LDURWi; + case AArch64::LDRSWui: + return AArch64::LDURSWi; + case AArch64::STRWui: + return AArch64::STURWi; + case AArch64::LDRHui: + return AArch64::LDURHi; + case AArch64::STRHui: + return AArch64::STURHi; + case AArch64::LDRHHui: + return AArch64::LDURHHi; + case AArch64::STRHHui: + return AArch64::STURHHi; + case AArch64::LDRSHXui: + return AArch64::LDURSHXi; + case AArch64::LDRSHWui: + return AArch64::LDURSHWi; + case AArch64::LDRBBui: + return AArch64::LDURBBi; + case AArch64::LDRBui: + return AArch64::LDURBi; + case AArch64::STRBBui: + return AArch64::STURBBi; + case AArch64::STRBui: + return AArch64::STURBi; + case AArch64::LDRSBWui: + return AArch64::LDURSBWi; + case AArch64::LDRSBXui: + return AArch64::LDURSBXi; + } +} + +MachineInstr 
*AArch64InstrInfo::emitLdStWithAddr(MachineInstr &MemI, + const ExtAddrMode &AM) const { + if (AM.ScaledReg) { + // The new instruction will be in the form `ldr Rt, [Xn, Xm, lsl #imm]`. + unsigned Opcode = regOffsetOpcode(MemI.getOpcode()); + // Copy the base register to the correct register class. + Register BaseReg = MemI.getMF()->getRegInfo().createVirtualRegister( + &AArch64::GPR64spRegClass); + const DebugLoc &DL = MemI.getDebugLoc(); + BuildMI(*MemI.getParent(), MemI, DL, get(TargetOpcode::COPY), BaseReg) + .addReg(AM.BaseReg); + auto B = BuildMI(*MemI.getParent(), MemI, DL, get(Opcode)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(BaseReg) + .addReg(AM.ScaledReg) + .addImm(0) + .addImm(AM.Scale > 1) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + return B.getInstr(); + } + + assert(AM.ScaledReg == 0 && AM.Scale == 0 && + "Addressing mode not supported for folding"); + +// The new instruction will be in the form `ld[u]r Rt, [Xn, #imm]`. + unsigned Scale = 1; + unsigned Opcode = MemI.getOpcode(); + if (isInt<9>(AM.Displacement)) + Opcode = unscaledOffsetOpcode(Opcode); + else + Opcode = scaledOffsetOpcode(Opcode, Scale); + + auto B = BuildMI(*MemI.getParent(), MemI, MemI.getDebugLoc(), get(Opcode)) + .addReg(MemI.getOperand(0).getReg(), + MemI.mayLoad() ? RegState::Define : 0) + .addReg(AM.BaseReg) + .addImm(AM.Displacement / Scale) + .setMemRefs(MemI.memoperands()) + .setMIFlags(MemI.getFlags()); + return B.getInstr(); +} + bool AArch64InstrInfo::getMemOperandWithOffsetWidth( const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, @@ -8427,6 +8859,30 @@ return OptLevel >= CodeGenOpt::Aggressive ? 
6 : 2; } +bool AArch64InstrInfo::isLegalAddressingMode(unsigned NumBytes, int64_t Offset, + unsigned Scale) const { + if (Offset && Scale) + return false; + + // Check Reg + Imm + if (!Scale) { + // 9-bit signed offset + if (isInt<9>(Offset)) + return true; + + // 12-bit unsigned offset + unsigned shift = Log2_64(NumBytes); + if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && + // Must be a multiple of NumBytes (NumBytes is a power of 2) + (Offset >> shift) << shift == Offset) + return true; + return false; + } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + return Scale == 1 || (Scale > 0 && Scale == NumBytes); +} + unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) { if (MF.getSubtarget().hardenSlsBlr()) return AArch64::BLRNoIP; Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -468,6 +468,7 @@ : TargetPassConfig(TM, PM) { if (TM.getOptLevel() != CodeGenOpt::None) substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); + setEnableSinkAndFold(true); } AArch64TargetMachine &getAArch64TargetMachine() const { Index: llvm/test/CodeGen/AArch64/loop-sink.mir =================================================================== --- llvm/test/CodeGen/AArch64/loop-sink.mir +++ llvm/test/CodeGen/AArch64/loop-sink.mir @@ -328,28 +328,18 @@ ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64common = COPY $x1 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x0 - ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 4, 0 + ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri [[COPY1]], 1, 0 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64all = COPY [[ADDXri]] - ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 8, 0 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64all = COPY [[ADDXri1]] - ; CHECK-NEXT: [[ADDXri2:%[0-9]+]]:gpr64sp = nuw ADDXri 
[[COPY]], 12, 0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64all = COPY [[ADDXri2]] - ; CHECK-NEXT: [[ADDXri3:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 16, 0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64all = COPY [[ADDXri3]] - ; CHECK-NEXT: [[ADDXri4:%[0-9]+]]:gpr64sp = nuw ADDXri [[COPY]], 20, 0 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64all = COPY [[ADDXri4]] - ; CHECK-NEXT: [[ADDXri5:%[0-9]+]]:gpr64sp = ADDXri [[COPY1]], 1, 0 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64all = COPY [[ADDXri5]] ; CHECK-NEXT: [[MOVaddrJT:%[0-9]+]]:gpr64common = MOVaddrJT target-flags(aarch64-page) %jump-table.0, target-flags(aarch64-pageoff, aarch64-nc) %jump-table.0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1..backedge: ; CHECK-NEXT: successors: %bb.9(0x09249249), %bb.2(0x76db6db7) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY7]], %bb.0, %7, %bb.9 + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[COPY2]], %bb.0, %7, %bb.9 ; CHECK-NEXT: [[LDRBBui:%[0-9]+]]:gpr32 = LDRBBui [[PHI]], 0 :: (load (s8) from %ir.lsr.iv) ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gpr64 = SUBREG_TO_REG 0, killed [[LDRBBui]], %subreg.sub_32 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32 - ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri killed [[COPY8]], 50, 0, implicit-def $nzcv + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32sp = COPY [[SUBREG_TO_REG]].sub_32 + ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri killed [[COPY3]], 50, 0, implicit-def $nzcv ; CHECK-NEXT: Bcc 8, %bb.9, implicit $nzcv ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2..backedge: @@ -371,7 +361,7 @@ ; CHECK-NEXT: successors: %bb.9(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: $x0 = COPY [[COPY2]] + ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 4, 0 ; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: B %bb.9 
@@ -380,7 +370,7 @@ ; CHECK-NEXT: successors: %bb.9(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: $x0 = COPY [[COPY3]] + ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 8, 0 ; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: B %bb.9 @@ -389,7 +379,7 @@ ; CHECK-NEXT: successors: %bb.9(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: $x0 = COPY [[COPY4]] + ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 12, 0 ; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: B %bb.9 @@ -398,7 +388,7 @@ ; CHECK-NEXT: successors: %bb.9(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: $x0 = COPY [[COPY5]] + ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 16, 0 ; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: B %bb.9 @@ -407,15 +397,15 @@ ; CHECK-NEXT: successors: %bb.9(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: $x0 = COPY [[COPY6]] + ; CHECK-NEXT: $x0 = nuw ADDXri [[COPY]], 20, 0 ; CHECK-NEXT: BL @_Z6assignPj, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.9..backedge.backedge: ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[ADDXri6:%[0-9]+]]:gpr64sp = ADDXri [[PHI]], 1, 0 - ; 
CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64all = COPY [[ADDXri6]] + ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64sp = ADDXri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64all = COPY [[ADDXri1]] ; CHECK-NEXT: B %bb.1 bb.0 (%ir-block.bb): successors: %bb.1(0x80000000) @@ -1383,9 +1373,8 @@ ; CHECK-NEXT: liveins: $x0, $x1, $x2, $w3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32common = COPY $w3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x2 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x0 ; CHECK-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri [[COPY]], 1, 0, implicit-def $nzcv ; CHECK-NEXT: Bcc 11, %bb.2, implicit $nzcv ; CHECK-NEXT: B %bb.1 @@ -1393,30 +1382,30 @@ ; CHECK-NEXT: bb.1.for.body.preheader: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32common = LDRWui [[COPY3]], 0 :: (load (s32) from %ir.read, !tbaa !0) + ; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32common = LDRWui [[COPY2]], 0 :: (load (s32) from %ir.read, !tbaa !0) ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[LDRWui]], 42, 0 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[ADDWri]] ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 43 - ; CHECK-NEXT: STRWui killed [[MOVi32imm]], [[COPY3]], 0 :: (store (s32) into %ir.read, !tbaa !0) + ; CHECK-NEXT: STRWui killed [[MOVi32imm]], [[COPY2]], 0 :: (store (s32) into %ir.read, !tbaa !0) ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.for.cond.cleanup: ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32 = PHI [[COPY]], %bb.0, %6, %bb.3 - ; CHECK-NEXT: STRWui [[PHI]], [[COPY2]], 0 :: (store (s32) into %ir.write, !tbaa !0) + ; CHECK-NEXT: STRWui [[PHI]], [[COPY1]], 0 :: (store (s32) into %ir.write, !tbaa !0) ; CHECK-NEXT: RET_ReallyLR ; CHECK-NEXT: {{ $}} ; 
CHECK-NEXT: bb.3.for.body: ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.3(0x7c000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gpr32common = PHI [[COPY4]], %bb.1, %8, %bb.3 + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.1, %8, %bb.3 ; CHECK-NEXT: [[PHI2:%[0-9]+]]:gpr32sp = PHI [[COPY]], %bb.1, %7, %bb.3 ; CHECK-NEXT: [[PHI3:%[0-9]+]]:gpr32 = PHI [[COPY]], %bb.1, %6, %bb.3 ; CHECK-NEXT: [[SDIVWr:%[0-9]+]]:gpr32 = SDIVWr [[PHI3]], [[PHI1]] - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[SDIVWr]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[SDIVWr]] ; CHECK-NEXT: [[SUBSWri1:%[0-9]+]]:gpr32 = SUBSWri [[PHI2]], 1, 0, implicit-def $nzcv - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr32all = COPY [[SUBSWri1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[SUBSWri1]] ; CHECK-NEXT: [[ADDWri1:%[0-9]+]]:gpr32sp = ADDWri [[PHI1]], 1, 0 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr32all = COPY [[ADDWri1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr32all = COPY [[ADDWri1]] ; CHECK-NEXT: Bcc 0, %bb.2, implicit $nzcv ; CHECK-NEXT: B %bb.3 bb.0.entry: Index: llvm/test/CodeGen/AArch64/sink-and-fold.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sink-and-fold.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s +target triple = "aarch64-linux" + +declare i32 @use(...) + +define i32 @f0(i1 %c1, ptr %p) nounwind { +; CHECK-LABEL: f0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB0_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: add x0, x1, #8 +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_2: // %if.else +; CHECK-NEXT: ldur w0, [x1, #8] +; CHECK-NEXT: ret +entry: + %a = getelementptr i32, ptr %p, i32 2 + br i1 %c1, label %if.then, label %if.else + +if.then: + %v0 = call i32 @use(ptr %a) + br label %exit + +if.else: + %v1 = load i32, ptr %a + br label %exit + +exit: + %v = phi i32 [%v0, %if.then], [%v1, %if.else] + ret i32 %v +} + +define i32 @f1(i1 %c1, ptr %p, i64 %i) nounwind { +; CHECK-LABEL: f1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: add x0, x1, x2 +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %if.else +; CHECK-NEXT: ldr w0, [x1, x2] +; CHECK-NEXT: ret +entry: + %a = getelementptr i8, ptr %p, i64 %i + br i1 %c1, label %if.then, label %if.else + +if.then: + %v0 = call i32 @use(ptr %a) + br label %exit + +if.else: + %v1 = load i32, ptr %a + br label %exit + +exit: + %v = phi i32 [%v0, %if.then], [%v1, %if.else] + ret i32 %v +} + +; Address calculation too slow. +%S = type {i32, [7 x i32] } +define i32 @f2(i1 %c1, ptr %p, i64 %i) "tune-cpu"="neoverse-n1" nounwind { +; CHECK-LABEL: f2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: add x1, x1, x2, lsl #5 +; CHECK-NEXT: tbz w0, #0, .LBB2_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: mov x0, x1 +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB2_2: // %if.else +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %a = getelementptr %S, ptr %p, i64 %i + br i1 %c1, label %if.then, label %if.else + +if.then: + %v0 = call i32 @use(ptr %a) + br label %exit + +if.else: + %v1 = call i32 @use(i32 1, ptr %a) + br label %exit + +exit: + %v = phi i32 [%v0, %if.then], [%v1, %if.else] + ret i32 %v +} + +; Address calculation cheap enough on some cores. +define i32 @f3(i1 %c1, ptr %p, i64 %i) "tune-cpu"="neoverse-n1" nounwind { +; CHECK-LABEL: f3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: tbz w0, #0, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: add x0, x1, x2, lsl #2 +; CHECK-NEXT: bl use +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %if.else +; CHECK-NEXT: ldr w0, [x1, x2, lsl #2] +; CHECK-NEXT: ret +entry: + %a = getelementptr i32, ptr %p, i64 %i + br i1 %c1, label %if.then, label %if.else + +if.then: + %v0 = call i32 @use(ptr %a) + br label %exit + +if.else: + %v1 = load i32, ptr %a + br label %exit + +exit: + %v = phi i32 [%v0, %if.then], [%v1, %if.else] + ret i32 %v +} + +define void @f4(ptr %a, i64 %n) nounwind { +; CHECK-LABEL: f4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp x1, #1 +; CHECK-NEXT: b.lt .LBB4_9 +; CHECK-NEXT: // %bb.1: // %LI.preheader +; CHECK-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x22, xzr +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: b .LBB4_3 +; CHECK-NEXT: .LBB4_2: // %LI.latch +; CHECK-NEXT: // in Loop: Header=BB4_3 Depth=1 +; CHECK-NEXT: cmp x22, x19 +; CHECK-NEXT: mov x22, x23 +; CHECK-NEXT: b.ge .LBB4_8 +; CHECK-NEXT: .LBB4_3: // %LI +; CHECK-NEXT: // =>This Loop Header: Depth=1 +; CHECK-NEXT: // Child Loop BB4_6 Depth 2 +; CHECK-NEXT: mov x21, xzr +; CHECK-NEXT: add x23, x22, #1 +; CHECK-NEXT: b .LBB4_6 +; CHECK-NEXT: .LBB4_4: // %if.else +; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2 +; CHECK-NEXT: ldr w0, [x20, x22, lsl #2] +; CHECK-NEXT: .LBB4_5: // %LJ.latch +; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2 +; CHECK-NEXT: add x8, x21, #1 +; CHECK-NEXT: str w0, [x20, x21, lsl #2] +; CHECK-NEXT: mov x21, x8 +; CHECK-NEXT: sub x9, x8, #1 +; CHECK-NEXT: cmp x9, x19 +; CHECK-NEXT: b.ge .LBB4_2 +; CHECK-NEXT: .LBB4_6: // %LJ +; CHECK-NEXT: // Parent Loop BB4_3 Depth=1 +; CHECK-NEXT: // => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldr w8, [x20, x21, lsl #2] +; CHECK-NEXT: tbz w8, #31, .LBB4_4 +; CHECK-NEXT: // %bb.7: // %if.then +; CHECK-NEXT: // in Loop: Header=BB4_6 Depth=2 +; CHECK-NEXT: add x0, x20, x22, lsl #2 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: bl use +; CHECK-NEXT: b .LBB4_5 +; CHECK-NEXT: .LBB4_8: +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: .LBB4_9: // %exit +; CHECK-NEXT: ret +entry: + %c0 = icmp slt i64 %n, 1 + br i1 %c0, label %exit, label %LI + +LI: + %i = phi i64 [0, %entry], [%i.next, %LI.latch] + %i.next = add i64 %i, 1 + %ai.ptr = getelementptr i32, ptr %a, i64 %i + br label %LJ + +LJ: + %j = phi i64 [0, %LI], [%j.next, %LJ.latch] + %j.next = add i64 %j, 1 + 
%aj.ptr = getelementptr i32, ptr %a, i64 %j + %aj = load i32, ptr %aj.ptr + %c1 = icmp slt i32 %aj, 0 + br i1 %c1, label %if.then, label %if.else + +if.then: + %v = call i32 @use(ptr %ai.ptr, i64 %j) + store i32 %v, ptr %aj.ptr + br label %LJ.latch + +if.else: + %ai = load i32, ptr %ai.ptr + store i32 %ai, ptr %aj.ptr + br label %LJ.latch + +LJ.latch: + %c2 = icmp slt i64 %j, %n + br i1 %c2, label %LJ, label %LI.latch + +LI.latch: + %c3 = icmp slt i64 %i, %n + br i1 %c3, label %LI, label %exit + +exit: + ret void +} + +%T = type { i32, i32, i32 } + +define void @f5(ptr %a, i32 %n, i32 %k) nounwind { +; CHECK-LABEL: f5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: cmp w1, #1 +; CHECK-NEXT: b.lt .LBB5_7 +; CHECK-NEXT: // %bb.1: // %L.preheader +; CHECK-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; CHECK-NEXT: mov w8, #12 // =0xc +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov w19, w1 +; CHECK-NEXT: smaddl x20, w2, w8, x0 +; CHECK-NEXT: add x21, x0, #8 +; CHECK-NEXT: mov w22, #-1 // =0xffffffff +; CHECK-NEXT: b .LBB5_4 +; CHECK-NEXT: .LBB5_2: // %if.else +; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1 +; CHECK-NEXT: ldur w0, [x20, #4] +; CHECK-NEXT: .LBB5_3: // %L.latch +; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1 +; CHECK-NEXT: add w22, w22, #1 +; CHECK-NEXT: cmp w22, w19 +; CHECK-NEXT: str w0, [x21], #12 +; CHECK-NEXT: b.ge .LBB5_6 +; CHECK-NEXT: .LBB5_4: // %L +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr w8, [x21] +; CHECK-NEXT: tbz w8, #31, .LBB5_2 +; CHECK-NEXT: // %bb.5: // %if.then +; CHECK-NEXT: // in Loop: Header=BB5_4 Depth=1 +; CHECK-NEXT: add x0, x20, #4 +; CHECK-NEXT: add w1, w22, #1 +; CHECK-NEXT: bl use +; CHECK-NEXT: b .LBB5_3 +; CHECK-NEXT: .LBB5_6: +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte 
Folded Reload +; CHECK-NEXT: .LBB5_7: // %exit +; CHECK-NEXT: ret +entry: + %p = getelementptr %T, ptr %a, i32 %k, i32 1 + %c0 = icmp slt i32 %n, 1 + br i1 %c0, label %exit, label %L + +L: + %i = phi i32 [0, %entry], [%i.next, %L.latch] + %i.next = add i32 %i, 1 + %ai.ptr = getelementptr %T, ptr %a, i32 %i, i32 2 + %ai = load i32, ptr %ai.ptr + %c1 = icmp slt i32 %ai, 0 + br i1 %c1, label %if.then, label %if.else + +if.then: + %u.0 = call i32 @use(ptr %p, i32 %i) + br label %L.latch + +if.else: + %u.1 = load i32, ptr %p + br label %L.latch + +L.latch: + %u = phi i32 [%u.0, %if.then], [%u.1, %if.else] + store i32 %u, ptr %ai.ptr + %c2 = icmp slt i32 %i, %n + br i1 %c2, label %L, label %exit + +exit: + ret void +}