Index: lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -148,12 +148,15 @@

   Break = M.getOrInsertFunction(
     BreakIntrinsic, Int64, Int64, (Type *)nullptr);
+  cast<Function>(Break)->setDoesNotAccessMemory();

   IfBreak = M.getOrInsertFunction(
     IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
+  cast<Function>(IfBreak)->setDoesNotAccessMemory();

   ElseBreak = M.getOrInsertFunction(
     ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
+  cast<Function>(ElseBreak)->setDoesNotAccessMemory();

   Loop = M.getOrInsertFunction(
     LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
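A minimal IR-level sketch (illustrative, not part of the patch) of what the setDoesNotAccessMemory() calls above amount to: the three break helpers are now declared readnone, so passes such as EarlyCSE may fold duplicate calls. The signatures follow the getOrInsertFunction calls; the attribute group number is arbitrary.

  declare i64 @llvm.amdgcn.break(i64) #0
  declare i64 @llvm.amdgcn.if.break(i1, i64) #0
  declare i64 @llvm.amdgcn.else.break(i64, i64) #0

  attributes #0 = { readnone }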
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1640,20 +1640,30 @@
 }

 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
-  if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
-    return false;
+  if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
+    switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
+    case AMDGPUIntrinsic::amdgcn_if:
+    case AMDGPUIntrinsic::amdgcn_else:
+    case AMDGPUIntrinsic::amdgcn_end_cf:
+    case AMDGPUIntrinsic::amdgcn_loop:
+      return true;
+    default:
+      return false;
+    }
+  }

-  switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
-  default: return false;
-  case AMDGPUIntrinsic::amdgcn_if:
-  case AMDGPUIntrinsic::amdgcn_else:
-  case AMDGPUIntrinsic::amdgcn_break:
-  case AMDGPUIntrinsic::amdgcn_if_break:
-  case AMDGPUIntrinsic::amdgcn_else_break:
-  case AMDGPUIntrinsic::amdgcn_loop:
-  case AMDGPUIntrinsic::amdgcn_end_cf:
-    return true;
+  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
+    case AMDGPUIntrinsic::amdgcn_break:
+    case AMDGPUIntrinsic::amdgcn_if_break:
+    case AMDGPUIntrinsic::amdgcn_else_break:
+      return true;
+    default:
+      return false;
+    }
   }
+
+  return false;
 }

 void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -1705,30 +1715,50 @@
     Target = BR->getOperand(1);
   }

+  // FIXME: This changes the types of the intrinsics instead of introducing new
+  // nodes with the correct types.
+  // e.g. llvm.amdgcn.loop
+
+  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
+  // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch
+
   if (!isCFIntrinsic(Intr)) {
     // This is a uniform branch so we don't need to legalize.
     return BRCOND;
   }

+  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
+                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+
   assert(!SetCC ||
         (SetCC->getConstantOperandVal(1) == 1 &&
          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
                               ISD::SETNE));

-  // Build the result and
-  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
-
-  // operands of the new intrinsic call
   SmallVector<SDValue, 8> Ops;
-  Ops.push_back(BRCOND.getOperand(0));
-  Ops.append(Intr->op_begin() + 1, Intr->op_end());
+
+  if (HaveChain)
+    Ops.push_back(BRCOND.getOperand(0));
+
+  Ops.append(Intr->op_begin() + (HaveChain ? 1 : 0), Intr->op_end());
   Ops.push_back(Target);

+  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
+
   // build the new intrinsic call
   SDNode *Result = DAG.getNode(
     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
     DAG.getVTList(Res), Ops).getNode();

+  if (!HaveChain) {
+    SDValue Ops[] = {
+      SDValue(Result, 0),
+      BRCOND.getOperand(0)
+    };
+
+    Result = DAG.getMergeValues(Ops, DL).getNode();
+  }
+
   if (BR) {
     // Give the branch instruction our target
     SDValue Ops[] = {
Index: lib/Target/AMDGPU/SIInstrFormats.td
===================================================================
--- lib/Target/AMDGPU/SIInstrFormats.td
+++ lib/Target/AMDGPU/SIInstrFormats.td
@@ -120,6 +120,9 @@

   let Uses = !if(UseExec, [EXEC], []);
   let Defs = !if(DefExec, [EXEC, SCC], [SCC]);
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 0;
 }

 class Enc32 {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1345,8 +1345,6 @@
 // SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.

-let hasSideEffects = 1 in {
-
 // Dummy terminator instruction to use after control flow instructions
 // replaced with exec mask operations.
 def SI_MASK_BRANCH : PseudoInstSI <
@@ -1365,12 +1363,18 @@
   [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 8;
+  let mayStore = 1;
+  let mayLoad = 1;
+  let hasSideEffects = 1;
 }

 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
   let Constraints = "$src = $dst";
   let Size = 12;
+  let mayStore = 1;
+  let mayLoad = 1;
+  let hasSideEffects = 1;
 }

 def SI_LOOP : CFPseudoInstSI <
@@ -1378,6 +1382,9 @@
   [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
   let Size = 8;
   let isBranch = 1;
+  let hasSideEffects = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
 }

 } // End isBranch = 1, isTerminator = 1
@@ -1386,24 +1393,35 @@
   (outs), (ins SReg_64:$saved),
   [(int_amdgcn_end_cf i64:$saved)], 1, 1> {
   let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
 }

 def SI_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$src),
   [(set i64:$dst, (int_amdgcn_break i64:$src))], 1> {
   let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
 }

 def SI_IF_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src),
   [(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]> {
   let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
 }

 def SI_ELSE_BREAK : CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1),
   [(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]> {
   let Size = 4;
+  let isAsCheapAsAMove = 1;
+  let isReMaterializable = 1;
 }

 let Uses = [EXEC], Defs = [EXEC,VCC] in {
@@ -1421,7 +1439,6 @@

 } // End Uses = [EXEC], Defs = [EXEC,VCC]

-} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1

 def SI_PS_LIVE : PseudoInstSI <
   (outs SReg_64:$dst), (ins),
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -186,11 +186,11 @@

   /* Control flow Intrinsics */

-  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>;
-  def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>;
-  def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
-  def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>;
-  def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>;
-  def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>;
-  def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], []>;
+  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
+  def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
+  def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
 }
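To make the intrinsic contracts above concrete, here is a minimal sketch (illustrative only; function and value names invented, IR-level signatures taken from SIAnnotateControlFlow.cpp) of roughly the annotated form -si-annotate-control-flow produces for a simple divergent loop, which the test below checks in more elaborate variations:

  define void @annotated_loop_sketch(i32 %n) {
  entry:
    br label %loop

  loop:
    %phi = phi i64 [ 0, %entry ], [ %mask, %loop ]
    %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
    %i.next = add i32 %i, 1
    %cc = icmp sge i32 %i.next, %n
    ; accumulate the mask of lanes that want to leave the loop
    %mask = call i64 @llvm.amdgcn.if.break(i1 %cc, i64 %phi)
    ; returns true once all active lanes have broken out
    %done = call i1 @llvm.amdgcn.loop(i64 %mask)
    br i1 %done, label %exit, label %loop

  exit:
    ; reactivate the lanes recorded in %mask
    call void @llvm.amdgcn.end.cf(i64 %mask)
    ret void
  }

  declare i64 @llvm.amdgcn.if.break(i1, i64)
  declare i1 @llvm.amdgcn.loop(i64)
  declare void @llvm.amdgcn.end.cf(i64)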
Index: test/CodeGen/AMDGPU/multilevel-break.ll
===================================================================
--- test/CodeGen/AMDGPU/multilevel-break.ll
+++ test/CodeGen/AMDGPU/multilevel-break.ll
@@ -1,22 +1,48 @@
-; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}define amdgpu_vs void @main
-; CHECK: main_body:
-; CHECK: LOOP.outer:
-; CHECK: LOOP:
-; CHECK: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
-; CHECK: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
+; RUN: opt -S -mtriple=amdgcn-- -structurizecfg -si-annotate-control-flow < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; OPT-LABEL: {{^}}define amdgpu_vs void @multi_else_break(
+; OPT: main_body:
+; OPT: LOOP.outer:
+; OPT: LOOP:
+; OPT: [[if:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(
+; OPT: [[if_exec:%[0-9]+]] = extractvalue { i1, i64 } [[if]], 1
 ;
-; CHECK: Flow:
+; OPT: Flow:
 ;
 ; Ensure two else.break calls, for both the inner and outer loops
+
+; OPT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; OPT-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
+; OPT-NEXT: call void @llvm.amdgcn.end.cf
 ;
-; CHECK: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; CHECK-NEXT: call i64 @llvm.amdgcn.else.break(i64 [[if_exec]],
-; CHECK-NEXT: call void @llvm.amdgcn.end.cf
-;
-; CHECK: Flow1:
-define amdgpu_vs void @main(<4 x float> %vec, i32 %ub, i32 %cont) {
+; OPT: Flow1:
+
+; GCN-LABEL: {{^}}multi_else_break:
+
+; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
+
+; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
+; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc
+
+; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}
+; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
+
+; Ensure the extra or is eliminated
+; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]
+; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
+; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %Flow1{{$}}
+; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1
+
+; Ensure the copy is eliminated
+; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
+; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
+; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
+define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
   br label %LOOP.outer
@@ -38,4 +64,52 @@
   br i1 %tmp51, label %LOOP, label %LOOP.outer
 }

-attributes #0 = { nounwind readnone }
+; OPT-LABEL: define void @multi_if_break_loop(
+; OPT: llvm.amdgcn.break
+; OPT: llvm.amdgcn.loop
+; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.if.break
+; OPT: llvm.amdgcn.end.cf

+; GCN-LABEL: {{^}}multi_if_break_loop:
+; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}
+
+; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
+
+; Uses a copy instead of an or
+; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
+; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
+define void @multi_if_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:
+  %lsr.iv = phi i32 [ undef, %bb ], [ %lsr.iv.next, %case0 ], [ %lsr.iv.next, %case1 ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  %load0 = load volatile i32, i32 addrspace(1)* undef, align 4
+  switch i32 %load0, label %bb9 [
+    i32 0, label %case0
+    i32 1, label %case1
+  ]
+
+case0:
+  %load1 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp slt i32 %tmp, %load1
+  br i1 %cmp1, label %bb1, label %bb9
+
+case1:
+  %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp2 = icmp slt i32 %tmp, %load2
+  br i1 %cmp2, label %bb1, label %bb9
+
+bb9:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }