Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -1484,18 +1484,10 @@ [llvm_i64_ty], [IntrConvergent] >; -def int_amdgcn_break : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty], [IntrNoMem, IntrConvergent] ->; - def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent] >; -def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], - [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent] ->; - def int_amdgcn_loop : Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrConvergent] >; Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -62,18 +62,10 @@ [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>] >; -def AMDGPUBreakOp : SDTypeProfile<1, 1, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>] ->; - def AMDGPUIfBreakOp : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>] >; -def AMDGPUElseBreakOp : SDTypeProfile<1, 2, - [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>] ->; - def AMDGPUAddeSubeOp : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>] >; Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -66,9 +66,7 @@ Function *If; Function *Else; - Function *Break; Function *IfBreak; - Function *ElseBreak; Function *Loop; Function *EndCf; @@ -95,8 +93,7 @@ Value * handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, - BranchInst *Term, - SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions); + BranchInst *Term); void handleLoop(BranchInst *Term); @@ -149,9 +146,7 @@ If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if); Else = 
Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else); - Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break); IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break); - ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break); Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop); EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf); return false; @@ -227,76 +222,7 @@ /// Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( - Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, - SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { - // Only search through PHI nodes which are inside the loop. If we try this - // with PHI nodes that are outside of the loop, we end up inserting new PHI - // nodes outside of the loop which depend on values defined inside the loop. - // This will break the module with - // 'Instruction does not dominate all users!' errors. - PHINode *Phi = nullptr; - if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) { - BasicBlock *Parent = Phi->getParent(); - PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front()); - Value *Ret = NewPhi; - - // Handle all non-constant incoming values first - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - BasicBlock *From = Phi->getIncomingBlock(i); - if (isa<Constant>(Incoming)) { - NewPhi->addIncoming(Broken, From); - continue; - } - - Phi->setIncomingValue(i, BoolFalse); - Value *PhiArg = handleLoopCondition(Incoming, Broken, L, - Term, LoopPhiConditions); - NewPhi->addIncoming(PhiArg, From); - } - - BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); - - for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { - Value *Incoming = Phi->getIncomingValue(i); - if (Incoming != BoolTrue) - continue; - - BasicBlock *From = Phi->getIncomingBlock(i); - if (From == IDom) { - // We're in the following situation: - // IDom/From 
// | \ - // | If-block - // | / - // Parent - // where we want to break out of the loop if the If-block is not taken. - // Due to the depth-first traversal, there should be an end.cf - // intrinsic in Parent, and we insert an else.break before it. - // - // Note that the end.cf need not be the first non-phi instruction - // of parent, particularly when we're dealing with a multi-level - // break, but it should occur within a group of intrinsic calls - // at the beginning of the block. - CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); - while (OldEnd && OldEnd->getCalledFunction() != EndCf) - OldEnd = dyn_cast<CallInst>(OldEnd->getNextNode()); - if (OldEnd && OldEnd->getCalledFunction() == EndCf) { - Value *Args[] = { OldEnd->getArgOperand(0), NewPhi }; - Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); - continue; - } - } - - Instruction *Insert = From->getTerminator(); - Value *PhiArg = CallInst::Create(Break, Broken, "", Insert); - NewPhi->setIncomingValue(i, PhiArg); - } - - LoopPhiConditions.push_back(WeakTrackingVH(Phi)); - return Ret; - } - + Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) { if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { BasicBlock *Parent = Inst->getParent(); Instruction *Insert; @@ -335,21 +261,15 @@ BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); - SmallVector<WeakTrackingVH, 8> LoopPhiConditions; Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); - Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions); + Value *Arg = handleLoopCondition(Cond, Broken, L, Term); for (BasicBlock *Pred : predecessors(Target)) Broken->addIncoming(Pred == BB ? 
Arg : Int64Zero, Pred); Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - for (WeakTrackingVH Val : llvm::reverse(LoopPhiConditions)) { - if (PHINode *Cond = cast_or_null<PHINode>(Val)) - eraseIfUnused(Cond); - } - push(Term->getSuccessor(0), Arg); } Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp =================================================================== --- lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -327,9 +327,7 @@ switch (DefInstr->getOpcode()) { default: break; - case AMDGPU::SI_BREAK: case AMDGPU::SI_IF_BREAK: - case AMDGPU::SI_ELSE_BREAK: return true; case AMDGPU::PHI: if (phiHasBreakDef(*DefInstr, MRI, Visited)) Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -264,14 +264,6 @@ let mayStore = 1; } -def SI_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src), - [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> { - let Size = 4; - let isAsCheapAsAMove = 1; - let isReMaterializable = 1; -} - def SI_IF_BREAK : CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> { let Size = 4; let isAsCheapAsAMove = 1; let isReMaterializable = 1; } @@ -280,14 +272,6 @@ -def SI_ELSE_BREAK : CFPseudoInstSI < - (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), - [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> { - let Size = 4; - let isAsCheapAsAMove = 1; - let isReMaterializable = 1; -} - let Uses = [EXEC] in { multiclass PseudoInstKill <dag ins> { Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -85,9 +85,7 @@ void emitIf(MachineInstr &MI); void emitElse(MachineInstr &MI); - void emitBreak(MachineInstr &MI); void emitIfBreak(MachineInstr &MI); - void 
emitElseBreak(MachineInstr &MI); void emitLoop(MachineInstr &MI); void emitEndCf(MachineInstr &MI); @@ -329,20 +327,6 @@ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI)); } -void SILowerControlFlow::emitBreak(MachineInstr &MI) { - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - unsigned Dst = MI.getOperand(0).getReg(); - - MachineInstr *Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) - .addReg(AMDGPU::EXEC) - .add(MI.getOperand(1)); - - if (LIS) - LIS->ReplaceMachineInstrInMaps(MI, *Or); - MI.eraseFromParent(); -} - void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -384,11 +368,6 @@ MI.eraseFromParent(); } -void SILowerControlFlow::emitElseBreak(MachineInstr &MI) { - // Lowered in the same way as emitIfBreak above. - emitIfBreak(MI); -} - void SILowerControlFlow::emitLoop(MachineInstr &MI) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -515,18 +494,10 @@ emitElse(MI); break; - case AMDGPU::SI_BREAK: - emitBreak(MI); - break; - case AMDGPU::SI_IF_BREAK: emitIfBreak(MI); break; - case AMDGPU::SI_ELSE_BREAK: - emitElseBreak(MI); - break; - case AMDGPU::SI_LOOP: emitLoop(MI); break; Index: test/CodeGen/AMDGPU/loop_break.ll =================================================================== --- test/CodeGen/AMDGPU/loop_break.ll +++ test/CodeGen/AMDGPU/loop_break.ll @@ -5,16 +5,17 @@ ; OPT-LABEL: @break_loop( ; OPT: bb1: -; OPT: call i64 @llvm.amdgcn.break(i64 +; OPT: icmp slt i32 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow ; OPT: bb4: ; OPT: load volatile +; OPT: icmp slt i32 ; OPT: xor i1 %cmp1 -; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: br label %Flow ; OPT: Flow: +; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: call i1 @llvm.amdgcn.loop(i64 ; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1 @@ -23,21 +24,19 @@ ; TODO: Can remove exec fixes in return block ; GCN-LABEL: 
{{^}}break_loop: -; GCN: s_mov_b64 [[INITMASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1 -; GCN: s_or_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INITMASK]] ; GCN: v_cmp_lt_i32_e32 vcc, -1 ; GCN: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] +; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]] ; GCN: ; %bb.2: ; %bb4 ; GCN: buffer_load_dword ; GCN: v_cmp_ge_i32_e32 vcc, -; GCN: s_or_b64 [[MASK]], vcc, [[INITMASK]] ; GCN: [[FLOW]]: -; GCN: s_mov_b64 [[INITMASK]], [[MASK]] +; GCN: s_or_b64 [[MASK]], vcc, [[MASK]] ; GCN: s_andn2_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]] @@ -66,25 +65,26 @@ ; OPT-LABEL: @undef_phi_cond_break_loop( ; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken) +; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 +; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow ; OPT: bb4: ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) ; OPT-NEXT: br label %Flow ; OPT: Flow: -; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) -; OPT-NEXT: br i1 %2, label %bb9, label %bb1 +; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ] +; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken) +; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0) +; OPT-NEXT: br i1 %1, label %bb9, label %bb1 ; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; 
OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { @@ -119,25 +119,26 @@ ; OPT-LABEL: @constexpr_phi_cond_break_loop( ; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken) +; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 +; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 ; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow ; OPT: bb4: ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) ; OPT-NEXT: br label %Flow ; OPT: Flow: -; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) -; OPT-NEXT: br i1 %2, label %bb9, label %bb1 +; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ] +; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken) +; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0) +; OPT-NEXT: br i1 %1, label %bb9, label %bb1 ; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { @@ -169,25 +170,26 @@ ; OPT-LABEL: @true_phi_cond_break_loop( ; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, 
%bb ] ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] -; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken) -; OPT: br i1 %cmp0, label %bb4, label %Flow +; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1 +; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0 +; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow ; OPT: bb4: ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) ; OPT-NEXT: br label %Flow ; OPT: Flow: -; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ] ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ] -; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) -; OPT-NEXT: br i1 %2, label %bb9, label %bb1 +; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ] +; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken) +; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0) +; OPT-NEXT: br i1 %1, label %bb9, label %bb1 ; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { @@ -219,7 +221,7 @@ ; OPT-LABEL: @false_phi_cond_break_loop( ; OPT: bb1: -; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ] +; OPT-NEXT: %phi.broken = phi i64 [ %0, %Flow ], [ 0, %bb ] ; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ] ; OPT-NOT: call ; OPT: br i1 %cmp0, label %bb4, label %Flow @@ -227,17 +229,17 @@ ; OPT: bb4: ; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4 ; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load -; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken) ; OPT-NEXT: br label %Flow ; OPT: Flow: -; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ] ; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 
], [ undef, %bb1 ] -; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ] +; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %tmp3, i64 %phi.broken) +; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %0) ; OPT-NEXT: br i1 %1, label %bb9, label %bb1 ; OPT: bb9: ; preds = %Flow -; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %0) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { Index: test/CodeGen/AMDGPU/multilevel-break.ll =================================================================== --- test/CodeGen/AMDGPU/multilevel-break.ll +++ test/CodeGen/AMDGPU/multilevel-break.ll @@ -10,11 +10,12 @@ ; ; OPT: Flow: ; -; Ensure two else.break calls, for both the inner and outer loops +; Ensure two if.break calls, for both the inner and outer loops -; OPT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]], -; OPT-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]], -; OPT-NEXT: call void @llvm.amdgcn.end.cf +; OPT: call void @llvm.amdgcn.end.cf +; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1 +; OPT-NEXT: call i1 @llvm.amdgcn.loop(i64 +; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1 ; ; OPT: Flow1: @@ -30,10 +31,9 @@ ; Ensure extra or eliminated ; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]] -; GCN-NEXT: s_mov_b64 -; GCN-NEXT: s_and_b64 [[MASKED_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]] -; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}} -; TODO: get rid of redundant loop counter moves +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]] ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]] @@ -43,8 +43,9 @@ ; Ensure copy is 
eliminated ; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]] -; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]] +; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc ; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}} +; GCN-NEXT: s_mov_b64 ; GCN-NEXT: v_mov_b32_e32 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]] ; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]] @@ -71,9 +72,8 @@ } ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop( -; OPT: llvm.amdgcn.break -; OPT: llvm.amdgcn.loop ; OPT: llvm.amdgcn.if.break +; OPT: llvm.amdgcn.loop ; OPT: llvm.amdgcn.if.break ; OPT: llvm.amdgcn.end.cf @@ -82,9 +82,10 @@ ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}} -; Uses a copy intsead of an or -; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]] -; GCN: s_or_b64 [[BREAK_REG]], exec, [[BREAK_REG]] +; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]] +; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]] +; GCN-NEXT: s_cbranch_execnz + define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/nested-loop-conditions.ll =================================================================== --- test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -10,7 +10,7 @@ ; IR-LABEL: @reduced_nested_loop_conditions( ; IR: bb5: -; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ] +; IR-NEXT: %phi.broken = phi i64 [ %3, %bb10 ], [ 0, %bb ] ; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ] ; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1 ; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7) @@ -19,25 +19,23 @@ ; IR-NEXT: br i1 %1, label %bb8, label %Flow ; IR: bb8: -; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken) ; IR-NEXT: br label %bb13 ; IR: bb10: -; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ] -; IR-NEXT: %tmp11 = phi i32 [ %5, 
%Flow ] -; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi) +; IR-NEXT: %tmp11 = phi i32 [ %6, %Flow ] +; IR-NEXT: %tmp12 = phi i1 [ %5, %Flow ] +; IR-NEXT: %3 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken) +; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %3) ; IR-NEXT: br i1 %4, label %bb23, label %bb5 ; IR: Flow: -; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ] -; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] -; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1) +; IR-NEXT: %5 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ] +; IR-NEXT: %6 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2) ; IR-NEXT: br label %bb10 ; IR: bb13: -; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ] -; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ] +; IR-NEXT: %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ] ; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32> ; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20 @@ -48,13 +46,12 @@ ; IR-NEXT: br label %bb20 ; IR: bb20: -; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ] -; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ] ; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ] +; IR-NEXT: %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ] ; IR-NEXT: br label %bb9 ; IR: bb23: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %3) ; IR-NEXT: ret void ; GCN-LABEL: {{^}}reduced_nested_loop_conditions: @@ -125,7 +122,7 @@ ; IR: Flow3: ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %21) -; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %14) ; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0 ; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1 ; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4 @@ -147,25 +144,24 @@ ; IR-NEXT: %8 = call { i1, i64 
} @llvm.amdgcn.if(i1 %tmp15) ; IR: Flow1: -; IR-NEXT: %loop.phi = phi i64 [ %18, %bb21 ], [ %phi.broken, %bb14 ] ; IR-NEXT: %11 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %bb14 ] ; IR-NEXT: %12 = phi i32 [ %tmp10, %bb21 ], [ undef, %bb14 ] -; IR-NEXT: %13 = phi i1 [ %17, %bb21 ], [ false, %bb14 ] -; IR-NEXT: %14 = phi i1 [ false, %bb21 ], [ true, %bb14 ] -; IR-NEXT: %15 = call i64 @llvm.amdgcn.else.break(i64 %10, i64 %loop.phi) +; IR-NEXT: %13 = phi i1 [ %18, %bb21 ], [ true, %bb14 ] +; IR-NEXT: %14 = phi i1 [ %18, %bb21 ], [ false, %bb14 ] +; IR-NEXT: %15 = phi i1 [ false, %bb21 ], [ true, %bb14 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %10) -; IR-NEXT: %16 = call i1 @llvm.amdgcn.loop(i64 %15) -; IR-NEXT: br i1 %16, label %Flow2, label %bb14 +; IR-NEXT: %16 = call i64 @llvm.amdgcn.if.break(i1 %13, i64 %phi.broken) +; IR-NEXT: %17 = call i1 @llvm.amdgcn.loop(i64 %16) +; IR-NEXT: br i1 %17, label %Flow2, label %bb14 ; IR: bb21: ; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %17 = xor i1 %tmp12, true -; IR-NEXT: %18 = call i64 @llvm.amdgcn.if.break(i1 %17, i64 %phi.broken) +; IR-NEXT: %18 = xor i1 %tmp12, true ; IR-NEXT: br label %Flow1 ; IR: Flow2: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %15) -; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %14) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %16) +; IR-NEXT: %19 = call { i1, i64 } @llvm.amdgcn.if(i1 %15) ; IR-NEXT: %20 = extractvalue { i1, i64 } %19, 0 ; IR-NEXT: %21 = extractvalue { i1, i64 } %19, 1 ; IR-NEXT: br i1 %20, label %bb31.loopexit, label %Flow3 Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -212,20 +212,16 @@ ; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]] ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]] ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]] -; SI: s_xor_b64 
[[ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG2]] ; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]] ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] -; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]] ; SI: [[LABEL_FLOW]]: ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]] -; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]] -; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]] -; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]] -; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]] +; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]] +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]] ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]