Index: lib/Target/AMDGPU/SILowerI1Copies.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -541,7 +541,7 @@
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT);
-  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineInstr *, 4> Vreg1Phis;
   SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
   SmallVector<unsigned, 4> IncomingRegs;
   SmallVector<unsigned, 4> IncomingUpdated;
@@ -550,118 +550,117 @@
 #endif
 
   for (MachineBasicBlock &MBB : *MF) {
-    LF.initialize(MBB);
-
     for (MachineInstr &MI : MBB.phis()) {
-      Register DstReg = MI.getOperand(0).getReg();
-      if (!isVreg1(DstReg))
-        continue;
+      if (isVreg1(MI.getOperand(0).getReg()))
+        Vreg1Phis.push_back(&MI);
+    }
+  }
 
-      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+  MachineBasicBlock *PrevMBB = nullptr;
+  for (MachineInstr *MI : Vreg1Phis) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    if (&MBB != PrevMBB) {
+      LF.initialize(MBB);
+      PrevMBB = &MBB;
+    }
 
-      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
-                                        : &AMDGPU::SReg_64RegClass);
+    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
 
-      // Collect incoming values.
-      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
-        assert(i + 1 < MI.getNumOperands());
-        Register IncomingReg = MI.getOperand(i).getReg();
-        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
-        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
-
-        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
-          IncomingReg = IncomingDef->getOperand(1).getReg();
-          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
-          assert(!IncomingDef->getOperand(1).getSubReg());
-        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
-          continue;
-        } else {
-          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
-        }
+    Register DstReg = MI->getOperand(0).getReg();
+    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+                                      : &AMDGPU::SReg_64RegClass);
+
+    // Collect incoming values.
+    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+      assert(i + 1 < MI->getNumOperands());
+      Register IncomingReg = MI->getOperand(i).getReg();
+      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
+      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
 
-        IncomingBlocks.push_back(IncomingMBB);
-        IncomingRegs.push_back(IncomingReg);
+      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+        IncomingReg = IncomingDef->getOperand(1).getReg();
+        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+        assert(!IncomingDef->getOperand(1).getSubReg());
+      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+        continue;
+      } else {
+        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
      }
 
+      IncomingBlocks.push_back(IncomingMBB);
+      IncomingRegs.push_back(IncomingReg);
+    }
+
 #ifndef NDEBUG
-      PhiRegisters.insert(DstReg);
+    PhiRegisters.insert(DstReg);
 #endif
 
-      // Phis in a loop that are observed outside the loop receive a simple but
-      // conservatively correct treatment.
-      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
-      for (MachineInstr &Use : MRI->use_instructions(DstReg))
-        DomBlocks.push_back(Use.getParent());
+    // Phis in a loop that are observed outside the loop receive a simple but
+    // conservatively correct treatment.
+    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+    for (MachineInstr &Use : MRI->use_instructions(DstReg))
+      DomBlocks.push_back(Use.getParent());
 
-      MachineBasicBlock *PostDomBound =
-          PDT->findNearestCommonDominator(DomBlocks);
-      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
-
-      SSAUpdater.Initialize(DstReg);
-
-      if (FoundLoopLevel) {
-        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+    MachineBasicBlock *PostDomBound =
+        PDT->findNearestCommonDominator(DomBlocks);
+    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          IncomingUpdated.push_back(createLaneMaskReg(*MF));
-          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
-                                       IncomingUpdated.back());
-        }
+    SSAUpdater.Initialize(DstReg);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
-        }
-      } else {
-        // The phi is not observed from outside a loop. Use a more accurate
-        // lowering.
-        PIA.analyze(MBB, IncomingBlocks);
-
-        for (MachineBasicBlock *MBB : PIA.predecessors())
-          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
-
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          if (PIA.isSource(IMBB)) {
-            IncomingUpdated.push_back(0);
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
-          } else {
-            IncomingUpdated.push_back(createLaneMaskReg(*MF));
-            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
-          }
-        }
+    if (FoundLoopLevel) {
+      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
 
-        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
-          if (!IncomingUpdated[i])
-            continue;
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        IncomingUpdated.push_back(createLaneMaskReg(*MF));
+        SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                     IncomingUpdated.back());
+      }
 
-          MachineBasicBlock &IMBB = *IncomingBlocks[i];
-          buildMergeLaneMasks(
-              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
-              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+      }
+    } else {
+      // The phi is not observed from outside a loop. Use a more accurate
+      // lowering.
+      PIA.analyze(MBB, IncomingBlocks);
+
+      for (MachineBasicBlock *MBB : PIA.predecessors())
+        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        if (PIA.isSource(IMBB)) {
+          IncomingUpdated.push_back(0);
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+        } else {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
        }
      }
 
-      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
-      if (NewReg != DstReg) {
-        MRI->replaceRegWith(NewReg, DstReg);
+      for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+        if (!IncomingUpdated[i])
+          continue;
 
-        // Ensure that DstReg has a single def and mark the old PHI node for
-        // deletion.
-        MI.getOperand(0).setReg(NewReg);
-        DeadPhis.push_back(&MI);
+        MachineBasicBlock &IMBB = *IncomingBlocks[i];
+        buildMergeLaneMasks(
+            IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
       }
-
-      IncomingBlocks.clear();
-      IncomingRegs.clear();
-      IncomingUpdated.clear();
     }
 
-    for (MachineInstr *MI : DeadPhis)
+    unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+    if (NewReg != DstReg) {
+      MRI->replaceRegWith(NewReg, DstReg);
       MI->eraseFromParent();
-    DeadPhis.clear();
+    }
+
+    IncomingBlocks.clear();
+    IncomingRegs.clear();
+    IncomingUpdated.clear();
   }
 }
Index: test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.ll
@@ -0,0 +1,137 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: kernel_i1_copy_phi_with_phi_incoming_value:
+; GCN: ; %bb.0:
+; GCN: s_mov_b64 [[RESTORE_EXEC:s\[[0-9]+:[0-9]+\]]], 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mul_i32 s6, s3, s2
+; GCN-NEXT: s_mul_i32 s7, s6, s8
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, s6, v0
+; GCN-NEXT: s_and_saveexec_b64 [[SKIP_LOOP_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: ; mask branch BB0_4
+; GCN-NEXT: s_cbranch_execz BB0_4
+
+; GCN: BB0_1:
+; GCN-NEXT: s_load_dword s9, s[4:5], 0x4
+; GCN-NEXT: s_load_dword s16, s[4:5], 0xc
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT: s_mov_b64 [[LOOP_EXEC:s\[[0-9]+:[0-9]+\]]], 0
+
+; GCN: BB0_2: ; =>This Inner Loop Header: Depth=1
+; GCN: s_or_b64 [[LOOP_EXEC]], vcc, [[LOOP_EXEC]]
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s12, v3
+; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
+; GCN-NEXT: global_load_dword v3, v[3:4], off
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ds_write_b32 v1, v3
+; GCN-NEXT: v_add_u32_e32 v1, s16, v1
+; GCN-NEXT: s_andn2_b64 exec, exec, [[LOOP_EXEC]]
+; GCN-NEXT: s_cbranch_execnz BB0_2
+
+; GCN: ; %bb.3: ; %Flow17
+; GCN-NEXT: s_or_b64 exec, exec, [[LOOP_EXEC]]
+; GCN-NEXT: s_mov_b64 [[RESTORE_EXEC]], exec
+
+; GCN-NEXT: BB0_4: ; %Flow18
+; GCN-NEXT: s_or_b64 exec, exec, [[SKIP_LOOP_EXEC]]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_barrier
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[RESTORE_EXEC]]
+
+; GCN: BB0_7: ; %Flow16
+; GCN-NEXT: s_endpgm
+
+@l_sdata = internal unnamed_addr addrspace(3) global [32 x float] undef, align 16
+
+define amdgpu_kernel void @kernel_i1_copy_phi_with_phi_incoming_value(i32 %0, i32 %1, float* readonly %2, float* %3) #0 {
+  %5 = mul nsw i32 %1, %0
+  %6 = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
+  %7 = mul i32 %5, %6
+  %8 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %9 = icmp slt i32 %8, %5
+  br i1 %9, label %10, label %.loopexit2
+
+10:                                               ; preds = %4
+  %11 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+  %12 = getelementptr inbounds i8, i8 addrspace(4)* %11, i64 4
+  %13 = bitcast i8 addrspace(4)* %12 to i16 addrspace(4)*
+  %14 = load i16, i16 addrspace(4)* %13, align 4
+  %15 = getelementptr inbounds i8, i8 addrspace(4)* %11, i64 12
+  %16 = bitcast i8 addrspace(4)* %15 to i32 addrspace(4)*
+  %17 = load i32, i32 addrspace(4)* %16, align 4
+  %18 = zext i16 %14 to i32
+  %19 = mul i32 %6, %18
+  %20 = sub i32 %17, %19
+  %21 = icmp ult i32 %20, %18
+  %22 = select i1 %21, i32 %20, i32 %18
+  br label %37
+
+.loopexit2:                                       ; preds = %37, %4
+  %23 = phi i1 [ false, %4 ], [ true, %37 ]
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier() #1
+  fence syncscope("workgroup") acquire
+  br i1 %23, label %24, label %.loopexit
+
+24:                                               ; preds = %.loopexit2
+  %25 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+  %26 = getelementptr inbounds i8, i8 addrspace(4)* %25, i64 4
+  %27 = bitcast i8 addrspace(4)* %26 to i16 addrspace(4)*
+  %28 = load i16, i16 addrspace(4)* %27, align 4
+  %29 = getelementptr inbounds i8, i8 addrspace(4)* %25, i64 12
+  %30 = bitcast i8 addrspace(4)* %29 to i32 addrspace(4)*
+  %31 = load i32, i32 addrspace(4)* %30, align 4
+  %32 = zext i16 %28 to i32
+  %33 = mul i32 %6, %32
+  %34 = sub i32 %31, %33
+  %35 = icmp ult i32 %34, %32
+  %36 = select i1 %35, i32 %34, i32 %32
+  br label %49
+
+37:                                               ; preds = %10, %37
+  %38 = phi i32 [ %8, %10 ], [ %47, %37 ]
+  %39 = add nsw i32 %38, %7
+  %40 = sext i32 %39 to i64
+  %41 = getelementptr inbounds float, float* %2, i64 %40
+  %42 = bitcast float* %41 to i32*
+  %43 = addrspacecast i32* %42 to i32 addrspace(1)*
+  %44 = load i32, i32 addrspace(1)* %43, align 4
+  %45 = getelementptr inbounds [32 x float], [32 x float] addrspace(3)* @l_sdata, i32 0, i32 %38
+  %46 = bitcast float addrspace(3)* %45 to i32 addrspace(3)*
+  store i32 %44, i32 addrspace(3)* %46, align 4
+  %47 = add i32 %38, %22
+  %48 = icmp slt i32 %47, %5
+  br i1 %48, label %37, label %.loopexit2
+
+.loopexit:                                        ; preds = %49, %.loopexit2
+  ret void
+
+49:                                               ; preds = %24, %49
+  %50 = phi i32 [ %8, %24 ], [ %64, %49 ]
+  %51 = sdiv i32 %50, %0
+  %52 = mul i32 %51, %0
+  %53 = sub i32 %50, %52
+  %54 = mul nsw i32 %53, %1
+  %55 = add nsw i32 %54, %51
+  %56 = getelementptr inbounds [32 x float], [32 x float] addrspace(3)* @l_sdata, i32 0, i32 %55
+  %57 = bitcast float addrspace(3)* %56 to i32 addrspace(3)*
+  %58 = load i32, i32 addrspace(3)* %57, align 4
+  %59 = add nsw i32 %50, %7
+  %60 = sext i32 %59 to i64
+  %61 = getelementptr inbounds float, float* %3, i64 %60
+  %62 = bitcast float* %61 to i32*
+  %63 = addrspacecast i32* %62 to i32 addrspace(1)*
+  store i32 %58, i32 addrspace(1)* %63, align 4
+  %64 = add i32 %50, %36
+  %65 = icmp slt i32 %64, %5
+  br i1 %65, label %49, label %.loopexit
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+declare void @llvm.amdgcn.s.barrier() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }