Index: lib/Target/AMDGPU/SILowerI1Copies.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -85,9 +85,9 @@
   }
 
 private:
-  void lowerCopiesFromI1();
-  void lowerPhis();
-  void lowerCopiesToI1();
+  void lowerCopiesFromI1(ReversePostOrderTraversal<MachineFunction *> &);
+  void lowerPhis(ReversePostOrderTraversal<MachineFunction *> &);
+  void lowerCopiesToI1(ReversePostOrderTraversal<MachineFunction *> &);
   bool isConstantLaneMask(unsigned Reg, bool &Val) const;
   void buildMergeLaneMasks(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
@@ -473,9 +473,11 @@
     OrN2Op = AMDGPU::S_ORN2_B64;
   }
 
-  lowerCopiesFromI1();
-  lowerPhis();
-  lowerCopiesToI1();
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&TheMF);
+
+  lowerCopiesFromI1(RPOT);
+  lowerPhis(RPOT);
+  lowerCopiesToI1(RPOT);
 
   for (unsigned Reg : ConstrainRegs)
     MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
@@ -484,11 +486,12 @@
   return true;
 }
 
-void SILowerI1Copies::lowerCopiesFromI1() {
+void SILowerI1Copies::lowerCopiesFromI1(
+    ReversePostOrderTraversal<MachineFunction *> &RPOT) {
   SmallVector<MachineInstr *, 4> DeadCopies;
 
-  for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB) {
+  for (MachineBasicBlock *MBB : RPOT) {
+    for (MachineInstr &MI : *MBB) {
       if (MI.getOpcode() != AMDGPU::COPY)
         continue;
 
@@ -511,7 +514,7 @@
       assert(!MI.getOperand(0).getSubReg());
 
       ConstrainRegs.insert(SrcReg);
-      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+      BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
           .addImm(0)
           .addImm(0)
           .addImm(0)
@@ -526,7 +529,8 @@
   }
 }
 
-void SILowerI1Copies::lowerPhis() {
+void SILowerI1Copies::lowerPhis(
+    ReversePostOrderTraversal<MachineFunction *> &RPOT) {
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT);
@@ -538,10 +542,10 @@
   DenseSet<unsigned> PhiRegisters;
 #endif
 
-  for (MachineBasicBlock &MBB : *MF) {
-    LF.initialize(MBB);
+  for (MachineBasicBlock *MBB : RPOT) {
+    LF.initialize(*MBB);
 
-    for (MachineInstr &MI : MBB.phis()) {
+    for (MachineInstr &MI : MBB->phis()) {
       unsigned DstReg = MI.getOperand(0).getReg();
       if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
         continue;
@@ -578,7 +582,7 @@
 
       // Phis in a loop that are observed outside the loop receive a simple but
       // conservatively correct treatment.
-      MachineBasicBlock *PostDomBound = &MBB;
+      MachineBasicBlock *PostDomBound = MBB;
       for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
         PostDomBound =
             PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
@@ -606,7 +610,7 @@
       } else {
         // The phi is not observed from outside a loop. Use a more accurate
        // lowering.
-        PIA.analyze(MBB, IncomingBlocks);
+        PIA.analyze(*MBB, IncomingBlocks);
 
         for (MachineBasicBlock *MBB : PIA.predecessors())
           SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
@@ -633,7 +637,7 @@
         }
       }
 
-      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(MBB);
       if (NewReg != DstReg) {
         MRI->replaceRegWith(NewReg, DstReg);
@@ -654,15 +658,16 @@
   }
 }
 
-void SILowerI1Copies::lowerCopiesToI1() {
+void SILowerI1Copies::lowerCopiesToI1(
+    ReversePostOrderTraversal<MachineFunction *> &RPOT) {
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   SmallVector<MachineInstr *, 4> DeadCopies;
 
-  for (MachineBasicBlock &MBB : *MF) {
-    LF.initialize(MBB);
+  for (MachineBasicBlock *MBB : RPOT) {
+    LF.initialize(*MBB);
 
-    for (MachineInstr &MI : MBB) {
+    for (MachineInstr &MI : *MBB) {
       if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
           MI.getOpcode() != AMDGPU::COPY)
         continue;
@@ -692,7 +697,7 @@
           !isLaneMaskReg(SrcReg)) {
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
         unsigned TmpReg = createLaneMaskReg(*MF);
-        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
             .addReg(SrcReg)
             .addImm(0);
         MI.getOperand(1).setReg(TmpReg);
@@ -701,7 +706,7 @@
 
       // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
-      MachineBasicBlock *PostDomBound = &MBB;
+      MachineBasicBlock *PostDomBound = MBB;
       for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
         PostDomBound =
             PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
@@ -710,11 +715,11 @@
       unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
       if (FoundLoopLevel) {
         SSAUpdater.Initialize(DstReg);
-        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        SSAUpdater.AddAvailableValue(MBB, DstReg);
         LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
 
-        buildMergeLaneMasks(MBB, MI, DL, DstReg,
-                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        buildMergeLaneMasks(*MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(MBB), SrcReg);
         DeadCopies.push_back(&MI);
       }
     }
Index: test/CodeGen/AMDGPU/i1-copies-rpo.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/i1-copies-rpo.mir
@@ -0,0 +1,51 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-i1-copies -o - %s | FileCheck %s
+
+# The strange block ordering visits the use before the def.
+---
+name: inserted_cmp_operand_class_rpo
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body: |
+  ; CHECK-LABEL: name: inserted_cmp_operand_class_rpo
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.3(0x80000000)
+  ; CHECK: S_BRANCH %bb.3
+  ; CHECK: bb.1:
+  ; CHECK: successors: %bb.2(0x80000000)
+  ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY %1
+  ; CHECK: bb.2:
+  ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]]
+  ; CHECK: S_ENDPGM 0
+  ; CHECK: bb.3:
+  ; CHECK: successors: %bb.1(0x80000000)
+  ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 0
+  ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MOV_B32_e32_1]], killed [[S_MOV_B32_]], implicit $exec
+  ; CHECK: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[V_CMP_EQ_U32_e64_]]
+  ; CHECK: S_BRANCH %bb.1
+  bb.0:
+    successors: %bb.3
+
+    S_BRANCH %bb.3
+
+  bb.1:
+    successors: %bb.2
+
+    %0:vreg_1 = COPY %1
+
+  bb.2:
+    %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %3:sreg_64_xexec = COPY %0
+    S_ENDPGM 0
+
+  bb.3:
+    successors: %bb.1
+
+    %4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %5:sreg_32_xm0 = S_MOV_B32 0
+    %6:sreg_64 = V_CMP_EQ_U32_e64 killed %4, killed %5, implicit $exec
+    %1:vreg_1 = COPY %6
+    S_BRANCH %bb.1
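
Note on the traversal (not part of the patch): ReversePostOrderTraversal from
llvm/ADT/PostOrderIterator.h visits each block only after its non-back-edge
predecessors, so in the test above bb.3 (the def of %1) is processed before
bb.1 (the use), even though bb.1 comes first in layout order. Below is a
minimal sketch of the pattern the patch threads through the three lowering
routines; the helper name visitInRPO is chosen here for illustration only.

  #include "llvm/ADT/PostOrderIterator.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineFunction.h"

  using namespace llvm;

  // Illustrative helper (hypothetical, not in the patch): walk the blocks of
  // MF in reverse post-order.
  static void visitInRPO(MachineFunction &MF) {
    // The constructor performs the post-order walk once and stores the order;
    // passing a single RPOT object to several routines, as the patch does,
    // avoids recomputing it for every lowering phase.
    ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
    for (MachineBasicBlock *MBB : RPOT) {
      // By the time MBB is visited, every forward predecessor has already been
      // processed, so registers rewritten there are visible to the uses here.
      (void)MBB;
    }
  }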