Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -1053,6 +1053,11 @@ /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; + + /// \returns the size cost of rematerializing a GlobalValue address relative + /// to a stack reload. + unsigned getGISelRematGlobalCost() const; + /// @} private: @@ -1269,6 +1274,7 @@ virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; + virtual unsigned getGISelRematGlobalCost() const = 0; virtual int getInstructionLatency(const Instruction *I) = 0; }; @@ -1701,6 +1707,11 @@ bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } + + unsigned getGISelRematGlobalCost() const override { + return Impl.getGISelRematGlobalCost(); + } + int getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -572,6 +572,10 @@ return true; } + unsigned getGISelRematGlobalCost() const { + return 1; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. Index: llvm/trunk/include/llvm/CodeGen/GlobalISel/Localizer.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/GlobalISel/Localizer.h +++ llvm/trunk/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -27,6 +27,7 @@ namespace llvm { // Forward declarations. class MachineRegisterInfo; +class TargetTransformInfo; /// This pass implements the localization mechanism described at the /// top of this file. One specificity of the implementation is that @@ -43,9 +44,11 @@ /// MRI contains all the register class/bank information that this /// pass uses and updates. MachineRegisterInfo *MRI; + /// TTI used for getting remat costs for instructions. + TargetTransformInfo *TTI; /// Check whether or not \p MI needs to be moved close to its uses. - static bool shouldLocalize(const MachineInstr &MI); + bool shouldLocalize(const MachineInstr &MI); /// Check if \p MOUse is used in the same basic block as \p Def. /// If the use is in the same block, we say it is local. @@ -57,6 +60,13 @@ /// Initialize the field members using \p MF. void init(MachineFunction &MF); + /// Do inter-block localization from the entry block. + bool localizeInterBlock(MachineFunction &MF, + SmallPtrSetImpl &LocalizedInstrs); + + /// Do intra-block localization of already localized instructions. + bool localizeIntraBlock(SmallPtrSetImpl &LocalizedInstrs); + public: Localizer(); Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp +++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp @@ -724,6 +724,10 @@ return TTIImpl->shouldExpandReduction(II); } +unsigned TargetTransformInfo::getGISelRematGlobalCost() const { + return TTIImpl->getGISelRematGlobalCost(); +} + int TargetTransformInfo::getInstructionLatency(const Instruction *I) const { return TTIImpl->getInstructionLatency(I); } Index: llvm/trunk/lib/CodeGen/GlobalISel/Localizer.cpp =================================================================== --- llvm/trunk/lib/CodeGen/GlobalISel/Localizer.cpp +++ llvm/trunk/lib/CodeGen/GlobalISel/Localizer.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -20,17 +21,55 @@ using namespace llvm; char Localizer::ID = 0; -INITIALIZE_PASS(Localizer, DEBUG_TYPE, - "Move/duplicate certain instructions close to their use", false, - false) +INITIALIZE_PASS_BEGIN(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", + false, false) Localizer::Localizer() : MachineFunctionPass(ID) { initializeLocalizerPass(*PassRegistry::getPassRegistry()); } -void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); } +void Localizer::init(MachineFunction &MF) { + MRI = &MF.getRegInfo(); + TTI = &getAnalysis().getTTI(MF.getFunction()); +} bool Localizer::shouldLocalize(const MachineInstr &MI) { + // Assuming a spill and reload of a value has a cost of 1 instruction each, + // this helper function computes the maximum number of uses we should consider + // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We + // break even in terms of code size when the original MI has 2 users vs + // choosing to potentially spill. Any more than 2 users we we have a net code + // size increase. This doesn't take into account register pressure though. + auto maxUses = [](unsigned RematCost) { + // A cost of 1 means remats are basically free. + if (RematCost == 1) + return UINT_MAX; + if (RematCost == 2) + return 2U; + + // Remat is too expensive, only sink if there's one user. + if (RematCost > 2) + return 1U; + llvm_unreachable("Unexpected remat cost"); + }; + + // Helper to walk through uses and terminate if we've reached a limit. Saves + // us spending time traversing uses if all we want to know is if it's >= min. + auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) { + unsigned NumUses = 0; + auto UI = MRI->use_instr_nodbg_begin(Reg), UE = MRI->use_instr_nodbg_end(); + for (; UI != UE && NumUses < MaxUses; ++UI) { + NumUses++; + } + // If we haven't reached the end yet then there are more than MaxUses users. + return UI == UE; + }; + switch (MI.getOpcode()) { default: return false; @@ -40,10 +79,20 @@ case TargetOpcode::G_FCONSTANT: case TargetOpcode::G_FRAME_INDEX: return true; + case TargetOpcode::G_GLOBAL_VALUE: { + unsigned RematCost = TTI->getGISelRematGlobalCost(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned MaxUses = maxUses(RematCost); + if (MaxUses == UINT_MAX) + return true; // Remats are "free" so always localize. + bool B = isUsesAtMost(Reg, MaxUses); + return B; + } } } void Localizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); getSelectionDAGFallbackAnalysisUsage(AU); MachineFunctionPass::getAnalysisUsage(AU); } @@ -57,6 +106,106 @@ return InsertMBB == Def.getParent(); } +bool Localizer::localizeInterBlock( + MachineFunction &MF, SmallPtrSetImpl &LocalizedInstrs) { + bool Changed = false; + DenseMap, unsigned> MBBWithLocalDef; + + // Since the IRTranslator only emits constants into the entry block, and the + // rest of the GISel pipeline generally emits constants close to their users, + // we only localize instructions in the entry block here. This might change if + // we start doing CSE across blocks. + auto &MBB = MF.front(); + for (MachineInstr &MI : MBB) { + if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) + continue; + LLVM_DEBUG(dbgs() << "Should localize: " << MI); + assert(MI.getDesc().getNumDefs() == 1 && + "More than one definition not supported yet"); + unsigned Reg = MI.getOperand(0).getReg(); + // Check if all the users of MI are local. + // We are going to invalidation the list of use operands, so we + // can't use range iterator. + for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); + MOIt != MOItEnd;) { + MachineOperand &MOUse = *MOIt++; + // Check if the use is already local. + MachineBasicBlock *InsertMBB; + LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent(); + dbgs() << "Checking use: " << MIUse + << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + if (isLocalUse(MOUse, MI, InsertMBB)) + continue; + LLVM_DEBUG(dbgs() << "Fixing non-local use\n"); + Changed = true; + auto MBBAndReg = std::make_pair(InsertMBB, Reg); + auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); + if (NewVRegIt == MBBWithLocalDef.end()) { + // Create the localized instruction. + MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); + LocalizedInstrs.insert(LocalizedMI); + MachineInstr &UseMI = *MOUse.getParent(); + if (MRI->hasOneUse(Reg) && !UseMI.isPHI()) + InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI); + else + InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()), + LocalizedMI); + + // Set a new register for the definition. + unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg)); + MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + LocalizedMI->getOperand(0).setReg(NewReg); + NewVRegIt = + MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; + LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI); + } + LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second) + << '\n'); + // Update the user reg. + MOUse.setReg(NewVRegIt->second); + } + } + return Changed; +} + +bool Localizer::localizeIntraBlock( + SmallPtrSetImpl &LocalizedInstrs) { + bool Changed = false; + + // For each already-localized instruction which has multiple users, then we + // scan the block top down from the current position until we hit one of them. + + // FIXME: Consider doing inst duplication if live ranges are very long due to + // many users, but this case may be better served by regalloc improvements. + + for (MachineInstr *MI : LocalizedInstrs) { + unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI->getParent(); + // If the instruction has a single use, we would have already moved it right + // before its user in localizeInterBlock(). + if (MRI->hasOneUse(Reg)) + continue; + + // All of the user MIs of this reg. + SmallPtrSet Users; + for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) + Users.insert(&UseMI); + + MachineBasicBlock::iterator II(MI); + ++II; + while (II != MBB.end() && !Users.count(&*II)) + ++II; + + LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *&*II + << "\n"); + assert(II != MBB.end() && "Didn't find the user in the MBB"); + MI->removeFromParent(); + MBB.insert(II, MI); + Changed = true; + } + return Changed; +} + bool Localizer::runOnMachineFunction(MachineFunction &MF) { // If the ISel pipeline failed, do not bother running that pass. if (MF.getProperties().hasProperty( @@ -67,62 +216,10 @@ init(MF); - bool Changed = false; - // Keep track of the instructions we localized. - // We won't need to process them if we see them later in the CFG. - SmallPtrSet LocalizedInstrs; - DenseMap, unsigned> MBBWithLocalDef; - // TODO: Do bottom up traversal. - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) - continue; - LLVM_DEBUG(dbgs() << "Should localize: " << MI); - assert(MI.getDesc().getNumDefs() == 1 && - "More than one definition not supported yet"); - unsigned Reg = MI.getOperand(0).getReg(); - // Check if all the users of MI are local. - // We are going to invalidation the list of use operands, so we - // can't use range iterator. - for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); - MOIt != MOItEnd;) { - MachineOperand &MOUse = *MOIt++; - // Check if the use is already local. - MachineBasicBlock *InsertMBB; - LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent(); - dbgs() << "Checking use: " << MIUse - << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); - if (isLocalUse(MOUse, MI, InsertMBB)) - continue; - LLVM_DEBUG(dbgs() << "Fixing non-local use\n"); - Changed = true; - auto MBBAndReg = std::make_pair(InsertMBB, Reg); - auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); - if (NewVRegIt == MBBWithLocalDef.end()) { - // Create the localized instruction. - MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); - LocalizedInstrs.insert(LocalizedMI); - // Don't try to be smart for the insertion point. - // There is no guarantee that the first seen use is the first - // use in the block. - InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()), - LocalizedMI); + // Keep track of the instructions we localized. We'll do a second pass of + // intra-block localization to further reduce live ranges. + SmallPtrSet LocalizedInstrs; - // Set a new register for the definition. - unsigned NewReg = - MRI->createGenericVirtualRegister(MRI->getType(Reg)); - MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); - LocalizedMI->getOperand(0).setReg(NewReg); - NewVRegIt = - MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; - LLVM_DEBUG(dbgs() << "Inserted: " << *LocalizedMI); - } - LLVM_DEBUG(dbgs() << "Update use with: " << printReg(NewVRegIt->second) - << '\n'); - // Update the user reg. - MOUse.setReg(NewVRegIt->second); - } - } - } - return Changed; + bool Changed = localizeInterBlock(MF, LocalizedInstrs); + return Changed |= localizeIntraBlock(LocalizedInstrs); } Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -165,6 +165,10 @@ return false; } + unsigned getGISelRematGlobalCost() const { + return 2; + } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const; Index: llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll +++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -o - -verify-machineinstrs -O0 -global-isel -stop-after=localizer %s | FileCheck %s +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios5.0.0" + +@var1 = common global i32 0, align 4 +@var2 = common global i32 0, align 4 +@var3 = common global i32 0, align 4 +@var4 = common global i32 0, align 4 + +; This is an ll test instead of MIR because -run-pass doesn't seem to support +; initializing the target TTI which we need for this test. + +; Some of the instructions in entry block are dead after this pass so don't +; strictly need to be checked for. + +define i32 @foo() { + ; CHECK-LABEL: name: foo + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1 + ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (load 4 from @var1) + ; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(eq), [[LOAD]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: G_BRCOND [[TRUNC]](s1), %bb.2 + ; CHECK: G_BR %bb.3 + ; CHECK: bb.2.if.then: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C4:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: G_STORE [[C4]](s32), [[GV3]](p0) :: (store 4 into @var2) + ; CHECK: [[C5:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1) + ; CHECK: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: G_STORE [[C4]](s32), [[GV4]](p0) :: (store 4 into @var3) + ; CHECK: G_STORE [[C5]](s32), [[GV]](p0) :: (store 4 into @var1) + ; CHECK: bb.3.if.end: + ; CHECK: [[C6:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C6]](s32) + ; CHECK: RET_ReallyLR implicit $w0 +entry: + %0 = load i32, i32* @var1, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + store i32 2, i32* @var2, align 4 + store i32 3, i32* @var1, align 4 + store i32 2, i32* @var3, align 4 + store i32 3, i32* @var1, align 4 + br label %if.end + +if.end: + ret i32 0 +} + Index: llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer.mir =================================================================== --- llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer.mir +++ llvm/trunk/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -15,6 +15,29 @@ define void @float_non_local_phi_use_followed_by_use_fi() { ret void } define void @non_local_phi() { ret void } define void @non_local_label() { ret void } + + @var1 = common global i32 0, align 4 + @var2 = common global i32 0, align 4 + @var3 = common global i32 0, align 4 + @var4 = common global i32 0, align 4 + + define i32 @intrablock_with_globalvalue() { + entry: + %0 = load i32, i32* @var1, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end + + if.then: + store i32 2, i32* @var2, align 4 + store i32 3, i32* @var1, align 4 + store i32 2, i32* @var3, align 4 + store i32 3, i32* @var1, align 4 + br label %if.end + + if.end: + ret i32 0 + } + ... --- @@ -301,3 +324,67 @@ %2:fpr(s32) = G_FADD %0, %1 G_BR %bb.1 ... +--- +name: intrablock_with_globalvalue +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: intrablock_with_globalvalue + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1 + ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (load 4 from @var1) + ; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(eq), [[LOAD]](s32), [[C]] + ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: G_BRCOND [[TRUNC]](s1), %bb.1 + ; CHECK: G_BR %bb.2 + ; CHECK: bb.1.if.then: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C4:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 2 + ; CHECK: G_STORE [[C4]](s32), [[GV3]](p0) :: (store 4 into @var2) + ; CHECK: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1 + ; CHECK: [[C5:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 3 + ; CHECK: G_STORE [[C5]](s32), [[GV4]](p0) :: (store 4 into @var1) + ; CHECK: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3 + ; CHECK: G_STORE [[C4]](s32), [[GV5]](p0) :: (store 4 into @var3) + ; CHECK: G_STORE [[C5]](s32), [[GV4]](p0) :: (store 4 into @var1) + ; CHECK: bb.2.if.end: + ; CHECK: [[C6:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C6]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + + ; Some of these instructions are dead. We're checking that the other instructions are + ; sunk immediately before their first user in the if.then block or as close as possible. + bb.1.entry: + %1:gpr(p0) = G_GLOBAL_VALUE @var1 + %2:gpr(s32) = G_CONSTANT i32 1 + %4:gpr(s32) = G_CONSTANT i32 2 + %5:gpr(p0) = G_GLOBAL_VALUE @var2 + %6:gpr(s32) = G_CONSTANT i32 3 + %7:gpr(p0) = G_GLOBAL_VALUE @var3 + %8:gpr(s32) = G_CONSTANT i32 0 + %0:gpr(s32) = G_LOAD %1(p0) :: (load 4 from @var1) + %9:gpr(s32) = G_ICMP intpred(eq), %0(s32), %2 + %3:gpr(s1) = G_TRUNC %9(s32) + G_BRCOND %3(s1), %bb.2 + G_BR %bb.3 + + bb.2.if.then: + G_STORE %4(s32), %5(p0) :: (store 4 into @var2) + G_STORE %6(s32), %1(p0) :: (store 4 into @var1) + G_STORE %4(s32), %7(p0) :: (store 4 into @var3) + G_STORE %6(s32), %1(p0) :: (store 4 into @var1) + + bb.3.if.end: + $w0 = COPY %8(s32) + RET_ReallyLR implicit $w0 + +...