Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -343,11 +343,44 @@
 }
 
 void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
-  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+  MachineBasicBlock &MBB = *MI.getParent();
+  const DebugLoc &DL = MI.getDebugLoc();
+  auto Dst = MI.getOperand(0).getReg();
+
+  // Skip ANDing with exec if the operand is already masked by exec because it
+  // is a V_CMP in the same basic block.
+  bool SkipAnding = false;
+  if (MI.getOperand(1).isReg())
+    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg()))
+      if (Def->getParent() == MI.getParent())
+        SkipAnding = SIInstrInfo::isVALU(*Def);
+
+  // AND the operand with exec, then OR that into the "loop exit" mask.
+  MachineInstr *And = nullptr, *Or = nullptr;
+  if (!SkipAnding) {
+    And = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
+              .addReg(AMDGPU::EXEC)
+              .add(MI.getOperand(1));
+    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+             .addReg(Dst)
+             .add(MI.getOperand(2));
+  } else
+    Or = BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+             .add(MI.getOperand(1))
+             .add(MI.getOperand(2));
+
+  if (LIS) {
+    if (And)
+      LIS->InsertMachineInstrInMaps(*And);
+    LIS->ReplaceMachineInstrInMaps(MI, *Or);
+  }
+
+  MI.eraseFromParent();
 }
 
 void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
-  MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+  // Lowered in the same way as emitIfBreak above.
+  emitIfBreak(MI);
 }
 
 void SILowerControlFlow::emitLoop(MachineInstr &MI) {
Index: test/CodeGen/AMDGPU/loop_exit_with_xor.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/loop_exit_with_xor.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Where the mask of lanes wanting to exit the loop on this iteration is not
+; obviously already masked by exec (in this case, the xor with -1 inserted by
+; control flow annotation), lower control flow must insert an S_AND_B64
+; with exec.
+
+; GCN-LABEL: {{^}}needs_and:
+; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
+; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
+; GCN: s_andn2_b64 exec, exec, [[REG3]]
+
+define void @needs_and(i32 %arg) {
+entry:
+  br label %loop
+
+loop:
+  %tmp23phi = phi i32 [ %tmp23, %endif ], [ 0, %entry ]
+  %tmp23 = add nuw i32 %tmp23phi, 1
+  %tmp27 = icmp ult i32 %arg, %tmp23
+  br i1 %tmp27, label %then, label %endif
+
+then:                                             ; preds = %loop
+  call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br label %endif
+
+endif:                                            ; preds = %then, %loop
+  br i1 %tmp27, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+; Where the mask of lanes wanting to exit the loop on this iteration is
+; obviously already masked by exec (a V_CMP), lower control flow can omit
+; the S_AND_B64 to avoid an unnecessary instruction.
+
+; GCN-LABEL: {{^}}doesnt_need_and:
+; GCN: v_cmp{{[^ ]*}} [[REG1:[^ ,]*]]
+; GCN: s_or_b64 [[REG2:[^ ,]*]], [[REG1]],
+; GCN: s_andn2_b64 exec, exec, [[REG2]]
+
+define void @doesnt_need_and(i32 %arg) {
+entry:
+  br label %loop
+
+loop:
+  %tmp23phi = phi i32 [ %tmp23, %loop ], [ 0, %entry ]
+  %tmp23 = add nuw i32 %tmp23phi, 1
+  %tmp27 = icmp ult i32 %arg, %tmp23
+  call void @llvm.amdgcn.buffer.store.f32(float undef, <4 x i32> undef, i32 0, i32 undef, i1 false, i1 false) #1
+  br i1 %tmp27, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3
+
+attributes #3 = { nounwind writeonly }
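
Note (not part of the patch): the new emitIfBreak lowering hinges on whether the S_AND_B64 with exec can be skipped because the break condition is already exec-masked. Below is a minimal standalone sketch of that check, using only APIs the patch itself calls (MachineRegisterInfo::getUniqueVRegDef, SIInstrInfo::isVALU). The helper name isAlreadyMaskedByExec and its signature are illustrative, not something the patch adds, and the includes assume the file sits in lib/Target/AMDGPU next to the pass.

// Sketch of the "SkipAnding" test that the patch inlines in emitIfBreak.
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

static bool isAlreadyMaskedByExec(const MachineOperand &BreakCond,
                                  const MachineInstr &UseMI,
                                  const MachineRegisterInfo &MRI) {
  // A non-register condition keeps the conservative AND with exec.
  if (!BreakCond.isReg())
    return false;
  // Only trust a unique def in the same basic block: a value defined in
  // another block (or merged through a PHI) may have been produced under a
  // different exec mask.
  MachineInstr *Def = MRI.getUniqueVRegDef(BreakCond.getReg());
  if (!Def || Def->getParent() != UseMI.getParent())
    return false;
  // VALU instructions such as V_CMP already clear the bits of inactive lanes
  // in their SGPR result, so re-ANDing with exec would be redundant.
  return SIInstrInfo::isVALU(*Def);
}

When this check holds, the lowering emits only the S_OR_B64 into the loop-exit mask; otherwise it first ANDs the condition with exec. That difference is exactly what the needs_and and doesnt_need_and functions in the new test verify.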