diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -541,7 +541,7 @@
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT);
-  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineInstr *, 4> Vreg1Phis;
   SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
   SmallVector<unsigned, 4> IncomingRegs;
   SmallVector<unsigned, 4> IncomingUpdated;
@@ -550,118 +550,117 @@
 #endif

   for (MachineBasicBlock &MBB : *MF) {
-    LF.initialize(MBB);
-
     for (MachineInstr &MI : MBB.phis()) {
-      Register DstReg = MI.getOperand(0).getReg();
-      if (!isVreg1(DstReg))
-        continue;
+      if (isVreg1(MI.getOperand(0).getReg()))
+        Vreg1Phis.push_back(&MI);
+    }
+  }

-      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+  MachineBasicBlock *PrevMBB = nullptr;
+  for (MachineInstr *MI : Vreg1Phis) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    if (&MBB != PrevMBB) {
+      LF.initialize(MBB);
+      PrevMBB = &MBB;
+    }

-      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
-                                        : &AMDGPU::SReg_64RegClass);
+    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

-      // Collect incoming values.
-      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-        assert(i + 1 < MI.getNumOperands());
-        Register IncomingReg = MI.getOperand(i).getReg();
-        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
-        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
-
-        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
-          IncomingReg = IncomingDef->getOperand(1).getReg();
-          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
-          assert(!IncomingDef->getOperand(1).getSubReg());
-        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
-          continue;
-        } else {
-          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
-        }
+    Register DstReg = MI->getOperand(0).getReg();
+    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+                                      : &AMDGPU::SReg_64RegClass);
+
+    // Collect incoming values.
+    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+      assert(i + 1 < MI->getNumOperands());
+      Register IncomingReg = MI->getOperand(i).getReg();
+      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
+      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

-        IncomingBlocks.push_back(IncomingMBB);
-        IncomingRegs.push_back(IncomingReg);
+      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+        IncomingReg = IncomingDef->getOperand(1).getReg();
+        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+        assert(!IncomingDef->getOperand(1).getSubReg());
+      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        continue;
+      } else {
+        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
       }

+      IncomingBlocks.push_back(IncomingMBB);
+      IncomingRegs.push_back(IncomingReg);
+    }
+
 #ifndef NDEBUG
-      PhiRegisters.insert(DstReg);
+    PhiRegisters.insert(DstReg);
 #endif

-      // Phis in a loop that are observed outside the loop receive a simple but
-      // conservatively correct treatment.
-      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
-      for (MachineInstr &Use : MRI->use_instructions(DstReg))
-        DomBlocks.push_back(Use.getParent());
+    // Phis in a loop that are observed outside the loop receive a simple but
+    // conservatively correct treatment.
+    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+    for (MachineInstr &Use : MRI->use_instructions(DstReg))
+      DomBlocks.push_back(Use.getParent());

-      MachineBasicBlock *PostDomBound =
-          PDT->findNearestCommonDominator(DomBlocks);
-      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
-
-      SSAUpdater.Initialize(DstReg);
-
-      if (FoundLoopLevel) {
-        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+    MachineBasicBlock *PostDomBound =
+        PDT->findNearestCommonDominator(DomBlocks);
+    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          IncomingUpdated.push_back(createLaneMaskReg(*MF));
-          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
-                                       IncomingUpdated.back());
-        }
+    SSAUpdater.Initialize(DstReg);

-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
-        }
-      } else {
-        // The phi is not observed from outside a loop. Use a more accurate
-        // lowering.
-        PIA.analyze(MBB, IncomingBlocks);
-
-        for (MachineBasicBlock *MBB : PIA.predecessors())
-          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
-
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          if (PIA.isSource(IMBB)) {
-            IncomingUpdated.push_back(0);
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
-          } else {
-            IncomingUpdated.push_back(createLaneMaskReg(*MF));
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
-          }
-        }
+    if (FoundLoopLevel) {
+      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);

-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          if (!IncomingUpdated[i])
-            continue;
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        IncomingUpdated.push_back(createLaneMaskReg(*MF));
+        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                     IncomingUpdated.back());
+      }

-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      }
+    } else {
+      // The phi is not observed from outside a loop. Use a more accurate
+      // lowering.
+      PIA.analyze(MBB, IncomingBlocks);
+
+      for (MachineBasicBlock *MBB : PIA.predecessors())
+        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        if (PIA.isSource(IMBB)) {
+          IncomingUpdated.push_back(0);
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+        } else {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
         }
       }

-      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
-      if (NewReg != DstReg) {
-        MRI->replaceRegWith(NewReg, DstReg);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        if (!IncomingUpdated[i])
+          continue;

-        // Ensure that DstReg has a single def and mark the old PHI node for
-        // deletion.
-        MI.getOperand(0).setReg(NewReg);
-        DeadPhis.push_back(&MI);
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
       }
-
-      IncomingBlocks.clear();
-      IncomingRegs.clear();
-      IncomingUpdated.clear();
     }

-    for (MachineInstr *MI : DeadPhis)
+    unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+    if (NewReg != DstReg) {
+      MRI->replaceRegWith(NewReg, DstReg);
       MI->eraseFromParent();
-    DeadPhis.clear();
+    }
+
+    IncomingBlocks.clear();
+    IncomingRegs.clear();
+    IncomingUpdated.clear();
   }
 }

diff --git a/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.ll b/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.ll
new file mode 100644
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.ll
@@ -0,0 +1,80 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: kernel_i1_copy_phi_with_phi_incoming_value:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dword s6, s[4:5], 0x0
+; GCN-NEXT: s_mov_b64 [[RESTORE_EXEC:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s6, v0
+; GCN-NEXT: s_and_saveexec_b64 [[SKIP_LOOP_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: ; mask branch BB0_4
+; GCN-NEXT: s_cbranch_execz BB0_4
+
+; GCN-NEXT: BB0_1: ; %loop.prefix
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8
+; GCN-NEXT: v_add_u32_e32 v2, 16, v0
+; GCN-NEXT: v_cmp_le_i32_e32 vcc, s6, v2
+; GCN-NEXT: s_mov_b64 [[LOOP_EXEC:s\[[0-9]+:[0-9]+\]]], 0
+
+; GCN-NEXT: BB0_2: ; %loop.body
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN: s_mov_b64 [[BKUP_EXEC:s\[[0-9]+:[0-9]+\]]], [[LOOP_EXEC]]
+; GCN-NEXT: s_and_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, vcc
+; GCN-NEXT: s_or_b64 [[LOOP_EXEC]], {{s\[[0-9]+:[0-9]+\]}}, [[BKUP_EXEC]]
+; GCN: s_andn2_b64 exec, exec, [[LOOP_EXEC]]
+; GCN-NEXT: s_cbranch_execnz BB0_2
+
+; GCN-NEXT: ; %bb.3: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec, [[LOOP_EXEC]]
+; GCN-NEXT: s_mov_b64 [[RESTORE_EXEC]], exec
+
+; GCN-NEXT: BB0_4: ; %Flow2
+; GCN-NEXT: s_or_b64 exec, exec, [[SKIP_LOOP_EXEC]]
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_barrier
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[RESTORE_EXEC]]
+
+; GCN-NEXT: ; mask branch BB0_5
+; GCN-NEXT: BB0_5: ; %end
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @kernel_i1_copy_phi_with_phi_incoming_value(i32 %arg0, float* %arg1) #0 {
+entry:
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %id_cmp = icmp slt i32 %tid, %arg0
+  br i1 %id_cmp, label %loop.prefix, label %loop.exit
+
+loop.prefix: ; preds = %entry
+  br label %loop.body
+
+loop.exit: ; preds = %loop.body, %entry
+  %cmp = phi i1 [ false, %entry ], [ true, %loop.body ]
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier() #1
+  fence syncscope("workgroup") acquire
+  br i1 %cmp, label %update, label %end
+
+update: ; preds = %loop.exit
+  store volatile i32 undef, i32 addrspace(1)* undef
+  br label %end
+
+loop.body: ; preds = %loop.prefix, %loop.body
+  %tid1 = phi i32 [ %tid, %loop.prefix ], [ %lp_tid, %loop.body ]
+  %cur_tid = sext i32 %tid1 to i64
+  %in_ptr = getelementptr inbounds float, float* %arg1, i64 %cur_tid
+  %in_val = load float, float* %in_ptr, align 4
+  %new_val = fadd float %in_val, 1.0
+  store float %new_val, float* %in_ptr
+  %lp_tid = add i32 %tid, 16
+  %lp_cmp = icmp slt i32 %lp_tid, %arg0
+  br i1 %lp_cmp, label %loop.body, label %loop.exit
+
+end: ; preds = %loop.exit, %update
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare void @llvm.amdgcn.s.barrier() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
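A note on the shape of the fix: previously each vreg_1 PHI was lowered in place while the pass was still walking MBB.phis(), with replaced PHIs merely queued in DeadPhis and erased at the end of each block. As the test name suggests, that in-place scheme is fragile once one vreg_1 PHI appears as the incoming value of another. The patch switches to a collect-then-lower structure: every vreg_1 PHI is recorded first, then the snapshot is processed, which lets each PHI be erased immediately after MRI->replaceRegWith(). The PrevMBB check merely keeps LF.initialize() at one call per block now that the iteration is no longer grouped by block. The following is a minimal standalone sketch of that pattern; the types and the LowerOne callback are illustrative stand-ins, not the pass's real API.

// Collect-then-lower sketch (illustrative stand-ins, not LLVM's API).
#include <functional>
#include <vector>

struct Phi { bool IsVreg1 = false; };       // stand-in for MachineInstr
struct Block { std::vector<Phi *> Phis; };  // stand-in for MachineBasicBlock

void lowerPhis(std::vector<Block *> &Blocks,
               const std::function<void(Phi *)> &LowerOne) {
  // Phase 1: snapshot the PHIs of interest before mutating anything, so
  // the scan never observes a half-lowered or already-erased PHI.
  std::vector<Phi *> Worklist;
  for (Block *B : Blocks)
    for (Phi *P : B->Phis)
      if (P->IsVreg1)
        Worklist.push_back(P);

  // Phase 2: lower from the snapshot. Each step may freely rewrite other
  // instructions and erase the PHI it replaces without invalidating this
  // loop, which is why the patched pass can erase each PHI on the spot
  // instead of queuing it for deletion.
  for (Phi *P : Worklist)
    LowerOne(P);
}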