// Patch summary (free text ahead of the first "diff --git" header).
//
// Files touched:
//   llvm/include/llvm/CodeGen/GlobalISel/Localizer.h
//   llvm/lib/CodeGen/GlobalISel/Localizer.cpp
//   llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir
//
// What the patch does:
//   * shouldLocalize() stops being static; its new G_GLOBAL_VALUE case reads
//     the MRI member to count the non-debug using instructions of the def and
//     returns true only when there are fewer than 3 of them.
//   * The body of runOnMachineFunction() is split into two private helpers:
//       - localizeInterBlock(): walks only MF.front() (the entry block), clones
//         each localizable instruction into every block that holds a non-local
//         use, and rewrites that use to the clone's new virtual register. The
//         clone is inserted right before the user when the original register
//         has one use and the user is not a PHI; otherwise it is inserted at
//         the start of the block after PHIs and labels.
//       - localizeIntraBlock(): for each instruction recorded in
//         LocalizedInstrs whose register does not have exactly one use, scans
//         forward from the instruction to the first user found in its block
//         and moves the instruction to just before that user.
//     runOnMachineFunction() now returns the OR of both helpers' results.
//   * The MIR test adds intrablock_with_globalvalue, whose CHECK lines expect
//     @var1 (3 users) to stay in the entry block while the other constants and
//     globals are re-materialized in bb.1 close to their first user.
//
// NOTE(review): in localizeIntraBlock() the LLVM_DEBUG statement prints
//   "*&*II" before "assert(II != MBB.end() ...)" runs, so the iterator is
//   dereferenced before the end-of-block check; the assert probably belongs
//   ahead of the debug print.
// NOTE(review): the added comment in localizeInterBlock() says "We are going to
//   invalidation the list of use operands"; "invalidate" is presumably meant.
//   The identical text in the removed lines has to stay as it is so that the
//   hunk still matches the old file.
// NOTE(review): localizeInterBlock()/localizeIntraBlock() test
//   MRI->hasOneUse(Reg) while shouldLocalize() counts uses through the nodbg
//   iterators; confirm whether debug uses are meant to be counted here.
// NOTE(review): several declarations read "SmallPtrSetImpl &", "SmallPtrSet"
//   and "DenseMap, unsigned>" with no template arguments, and the hunks below
//   have lost their original line breaks; presumably both are artefacts of how
//   this copy was extracted. TODO confirm against the upstream commit before
//   trying to apply it.
//
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h --- a/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -45,7 +45,7 @@ MachineRegisterInfo *MRI; /// Check whether or not \p MI needs to be moved close to its uses. - static bool shouldLocalize(const MachineInstr &MI); + bool shouldLocalize(const MachineInstr &MI); /// Check if \p MOUse is used in the same basic block as \p Def. /// If the use is in the same block, we say it is local. @@ -57,6 +57,13 @@ /// Initialize the field members using \p MF. void init(MachineFunction &MF); + /// Do inter-block localization from the entry block. + bool localizeInterBlock(MachineFunction &MF, + SmallPtrSetImpl &LocalizedInstrs); + + /// Do intra-block localization of already localized instructions. + bool localizeIntraBlock(SmallPtrSetImpl &LocalizedInstrs); + public: Localizer(); diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp --- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp @@ -40,6 +40,16 @@ case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_FRAME_INDEX: return true; + case TargetOpcode::G_GLOBAL_VALUE: { + // G_GLOBAL_VALUES will usually result in a two instruction materialization + // sequence, at least on AArch64, which can increase code size if we remat + // in multiple places vs just taking a single hit with a spill + reload. + // FIXME: Make this configurable by the target with a hook. 
+ unsigned Reg = MI.getOperand(0).getReg(); + int NumUses = std::distance(MRI->use_instr_nodbg_begin(Reg), + MRI->use_instr_nodbg_end()); + return NumUses < 3; + } } } @@ -57,6 +67,106 @@ return InsertMBB == Def.getParent(); } +bool Localizer::localizeInterBlock( + MachineFunction &MF, SmallPtrSetImpl &LocalizedInstrs) { + bool Changed = false; + DenseMap, unsigned> MBBWithLocalDef; + + // Since the IRTranslator only emits constants into the entry block, and the + // rest of the GISel pipeline generally emits constants close to their users, + // we only localize instructions in the entry block here. This might change if + // we start doing CSE across blocks. + auto &MBB = MF.front(); + for (MachineInstr &MI : MBB) { + if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) + continue; + LLVM_DEBUG(dbgs() << "Should localize: " << MI); + assert(MI.getDesc().getNumDefs() == 1 && + "More than one definition not supported yet"); + unsigned Reg = MI.getOperand(0).getReg(); + // Check if all the users of MI are local. + // We are going to invalidation the list of use operands, so we + // can't use range iterator. + for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); + MOIt != MOItEnd;) { + MachineOperand &MOUse = *MOIt++; + // Check if the use is already local. + MachineBasicBlock *InsertMBB; + LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent(); + dbgs() << "Checking use: " << MIUse + << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + if (isLocalUse(MOUse, MI, InsertMBB)) + continue; + LLVM_DEBUG(dbgs() << "Fixing non-local use\n"); + Changed = true; + auto MBBAndReg = std::make_pair(InsertMBB, Reg); + auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); + if (NewVRegIt == MBBWithLocalDef.end()) { + // Create the localized instruction. 
+ MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); + LocalizedInstrs.insert(LocalizedMI); + MachineInstr &UseMI = *MOUse.getParent(); + if (MRI->hasOneUse(Reg) && !UseMI.isPHI()) + InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI); + else + InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()), + LocalizedMI); + + // Set a new register for the definition. + unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); + MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + LocalizedMI->getOperand(0).setReg(NewReg); + NewVRegIt = + MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; + LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI); + } + LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second) + << '\n'); + // Update the user reg. + MOUse.setReg(NewVRegIt->second); + } + } + return Changed; +} + +bool Localizer::localizeIntraBlock( + SmallPtrSetImpl &LocalizedInstrs) { + bool Changed = false; + + // For each already-localized instruction which has multiple users, then we + // scan the block top down from the current position until we hit one of them. + + // FIXME: Consider doing inst duplication if live ranges are very long due to + // many users, but this case may be better served by regalloc improvements. + + for (MachineInstr *MI : LocalizedInstrs) { + unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI->getParent(); + // If the instruction has a single use, we would have already moved it right + // before its user in localizeInterBlock(). + if (MRI->hasOneUse(Reg)) + continue; + + // All of the user MIs of this reg. 
+ SmallPtrSet Users; + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) + Users.insert(&UseMI); + + MachineBasicBlock::iterator II(MI); + ++II; + while (II != MBB.end() && !Users.count(&*II)) + ++II; + + LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *&*II + << "\n"); + assert(II != MBB.end() && "Didn't find the user in the MBB"); + MI->removeFromParent(); + MBB.insert(II, MI); + Changed = true; + } + return Changed; +} + bool Localizer::runOnMachineFunction(MachineFunction &MF) { // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( @@ -67,62 +177,10 @@ init(MF); - bool Changed = false; - // Keep track of the instructions we localized. - // We won't need to process them if we see them later in the CFG. - SmallPtrSet LocalizedInstrs; - DenseMap, unsigned> MBBWithLocalDef; - // TODO: Do bottom up traversal. - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) - continue; - LLVM_DEBUG(dbgs() << "Should localize: " << MI); - assert(MI.getDesc().getNumDefs() == 1 && - "More than one definition not supported yet"); - unsigned Reg = MI.getOperand(0).getReg(); - // Check if all the users of MI are local. - // We are going to invalidation the list of use operands, so we - // can't use range iterator. - for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); - MOIt != MOItEnd;) { - MachineOperand &MOUse = *MOIt++; - // Check if the use is already local. 
- MachineBasicBlock *InsertMBB; - LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent(); - dbgs() << "Checking use: " << MIUse - << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); - if (isLocalUse(MOUse, MI, InsertMBB)) - continue; - LLVM_DEBUG(dbgs() << "Fixing non-local use\n"); - Changed = true; - auto MBBAndReg = std::make_pair(InsertMBB, Reg); - auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); - if (NewVRegIt == MBBWithLocalDef.end()) { - // Create the localized instruction. - MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); - LocalizedInstrs.insert(LocalizedMI); - // Don't try to be smart for the insertion point. - // There is no guarantee that the first seen use is the first - // use in the block. - InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()), - LocalizedMI); + // Keep track of the instructions we localized. We'll do a second pass of + // intra-block localization to further reduce live ranges. + SmallPtrSet LocalizedInstrs; - // Set a new register for the definition. - unsigned NewReg = - MRI->createGenericVirtualRegister(MRI->getType(Reg)); - MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); - LocalizedMI->getOperand(0).setReg(NewReg); - NewVRegIt = - MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; - LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI); - } - LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second) - << '\n'); - // Update the user reg. 
- MOUse.setReg(NewVRegIt->second); - } - } - } - return Changed; + bool Changed = localizeInterBlock(MF, LocalizedInstrs); + return Changed |= localizeIntraBlock(LocalizedInstrs); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir --- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -15,6 +15,29 @@ define void @float_non_local_phi_use_followed_by_use_fi() { ret void } define void @non_local_phi() { ret void } define void @non_local_label() { ret void } + + @var1 = common global i32 0, align 4 + @var2 = common global i32 0, align 4 + @var3 = common global i32 0, align 4 + @var4 = common global i32 0, align 4 + + define i32 @intrablock_with_globalvalue() { + entry: + %0 = load i32, i32* @var1, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end + + if.then: + store i32 2, i32* @var2, align 4 + store i32 3, i32* @var1, align 4 + store i32 2, i32* @var3, align 4 + store i32 3, i32* @var1, align 4 + br label %if.end + + if.end: + ret i32 0 + } + ... --- @@ -301,3 +324,68 @@ %2:fpr(s32) = G_FADD %0, %1 G_BR %bb.1 ... +--- +name: intrablock_with_globalvalue +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: intrablock_with_globalvalue + ; CHECK: bb.0.entry: + + ; Some of these instructions are dead. We're checking: + ; 1. That var1 isn't localized because it has 3 users. + ; 2. That the other instructions are sunk immediately before their first user + ; in the if.then block or as close as possible. 
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1 + ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (load 4 from @var1) + ; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(eq), [[LOAD]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: G_BRCOND [[TRUNC]](s1), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1.if.then: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C4:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: G_STORE [[C4]](s32), [[GV3]](p0) :: (store 4 into @var2) + ; CHECK: [[C5:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1) + ; CHECK: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: G_STORE [[C4]](s32), [[GV4]](p0) :: (store 4 into @var3) + ; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1) + ; CHECK: bb.2.if.end: + ; CHECK: [[C6:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C6]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + bb.1.entry: + %1:gpr(p0) = G_GLOBAL_VALUE @var1 + %2:gpr(s32) = G_CONSTANT i32 1 + %4:gpr(s32) = G_CONSTANT i32 2 + %5:gpr(p0) = G_GLOBAL_VALUE @var2 + %6:gpr(s32) = G_CONSTANT i32 3 + %7:gpr(p0) = G_GLOBAL_VALUE @var3 + %8:gpr(s32) = G_CONSTANT i32 0 + %0:gpr(s32) = G_LOAD %1(p0) :: (load 4 from @var1) + %9:gpr(s32) = G_ICMP intpred(eq), %0(s32), %2 + %3:gpr(s1) = G_TRUNC %9(s32) + G_BRCOND %3(s1), %bb.2 + G_BR %bb.3 + + bb.2.if.then: + G_STORE %4(s32), %5(p0) :: (store 4 into @var2) + G_STORE %6(s32), %1(p0) :: (store 4 into @var1) + G_STORE 
%4(s32), %7(p0) :: (store 4 into @var3) + G_STORE %6(s32), %1(p0) :: (store 4 into @var1) + + bb.3.if.end: + $w0 = COPY %8(s32) + RET_ReallyLR implicit $w0 + +...