Index: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -344,11 +344,49 @@
 }
 
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
-  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  auto Dst = MI.getOperand(0).getReg();
+
+  // Skip ANDing with exec if the break condition is already masked by exec
+  // because it is a V_CMP in the same basic block. (We know the break
+  // condition operand was an i1 in IR, so if it is a VALU instruction it must
+  // be one with a carry-out.)
+  bool SkipAnding = false;
+  if (MI.getOperand(1).isReg()) {
+    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
+      SkipAnding = Def->getParent() == MI.getParent()
+          && SIInstrInfo::isVALU(*Def);
+    }
+  }
+
+  // AND the break condition operand with exec, then OR that into the "loop
+  // exit" mask.
+  MachineInstr *And = nullptr, *Or = nullptr;
+  if (!SkipAnding) {
+    And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+              .addReg(AMDGPU::EXEC)
+              .add(MI.getOperand(1));
+    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+             .addReg(Dst)
+             .add(MI.getOperand(2));
+  } else
+    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+             .add(MI.getOperand(1))
+             .add(MI.getOperand(2));
+
+  if (LIS) {
+    if (And)
+      LIS->InsertMachineInstrInMaps(*And);
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
+  }
+
+  MI.eraseFromParent();
 }
 
 void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
-  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+  // Lowered in the same way as emitIfBreak above.
+  emitIfBreak(MI);
 }
 
 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
Index: llvm/trunk/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; When the mask of lanes wanting to exit the loop on this iteration is not
+; obviously already masked by exec (in this case, the xor with -1 inserted by
+; control flow annotation), lower control flow must insert an S_AND_B64
+; with exec.
+
+; GCN-LABEL: {{^}}needs_and:
+; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
+; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
+; GCN: s_andn2_b64 exec, exec, [[REG3]]
+
+define void @needs_and(i32 %arg) {
+entry:
+  br label %loop
+
+loop:
+  %tmp23phi = phi i32 [ %tmp23, %endif ], [ 0, %entry ]
+  %tmp23 = add nuw i32 %tmp23phi, 1
+  %tmp27 = icmp ult i32 %arg, %tmp23
+  br i1 %tmp27, label %then, label %endif
+
+then:                                             ; preds = %loop
+  call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br label %endif
+
+endif:                                            ; preds = %then, %loop
+  br i1 %tmp27, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+; When the mask of lanes wanting to exit the loop on this iteration is
+; obviously already masked by exec (a V_CMP), lower control flow can omit the
+; S_AND_B64 to avoid an unnecessary instruction.
+
+; GCN-LABEL: {{^}}doesnt_need_and:
+; GCN: v_cmp{{[^ ]*}} [[REG1:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1]],
+; GCN: s_andn2_b64 exec, exec, [[REG2]]
+
+define void @doesnt_need_and(i32 %arg) {
+entry:
+  br label %loop
+
+loop:
+  %tmp23phi = phi i32 [ %tmp23, %loop ], [ 0, %entry ]
+  %tmp23 = add nuw i32 %tmp23phi, 1
+  %tmp27 = icmp ult i32 %arg, %tmp23
+  call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br i1 %tmp27, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+; Another case where the mask of lanes wanting to exit the loop is not masked
+; by exec, because it is a function parameter.
+
+; GCN-LABEL: {{^}}break_cond_is_arg:
+; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
+; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
+; GCN: s_andn2_b64 exec, exec, [[REG3]]
+
+define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
+entry:
+  br label %loop
+
+loop:
+  %tmp23phi = phi i32 [ %tmp23, %endif ], [ 0, %entry ]
+  %tmp23 = add nuw i32 %tmp23phi, 1
+  %tmp27 = icmp ult i32 %arg, %tmp23
+  br i1 %tmp27, label %then, label %endif
+
+then:                                             ; preds = %loop
+  call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br label %endif
+
+endif:                                            ; preds = %then, %loop
+  br i1 %breakcond, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3
+
+attributes #3 = { nounwind writeonly }
+
Index: llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -30,7 +30,9 @@
 
 ; Ensure extra or eliminated
 ; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
-; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_mov_b64
+; GCN-NEXT: s_and_b64 [[MASKED_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
+; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
 
@@ -39,7 +41,8 @@
 
 ; Ensure copy is eliminated
 ; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
-; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_BREAK]]
+; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
 ; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
Index: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
@@ -223,7 +223,9 @@
 ; SI: [[LABEL_FLOW]]:
 ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
 ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG3]]
-; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG3]], [[TMP]]
+; SI-NEXT: s_mov_b64 [[MOVED_TMP:s\[[0-9]+:[0-9]+\]]], [[TMP]]
+; SI-NEXT: s_and_b64 [[MASKED_ORNEG3:s\[[0-9]+:[0-9]+\]]], exec, [[ORNEG3]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[MASKED_ORNEG3]], [[MOVED_TMP]]
 ; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
 ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]