diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -313,8 +313,15 @@
 
   Value *Exec = popSaved();
   Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
-  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
+    Instruction *ExecDef = cast<Instruction>(Exec);
+    BasicBlock *DefBB = ExecDef->getParent();
+    if (!DT->dominates(DefBB, BB)) {
+      // Split edge to make Def dominate Use
+      FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+    }
     CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
+  }
 }
 
 /// Annotate the control flow with intrinsics so the backend can
@@ -327,7 +334,6 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
 
   initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
-
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
     BasicBlock *BB = *I;
@@ -344,7 +350,8 @@
       if (isTopOfStack(BB))
         closeControlFlow(BB);
 
-      handleLoop(Term);
+      if (DT->dominates(Term->getSuccessor(1), BB))
+        handleLoop(Term);
       continue;
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll b/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+
+define hidden void @kernel_path_surface_bounce() {
+; GCN-LABEL: kernel_path_surface_bounce:
+; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GCN-NEXT:    s_mov_b64 exec, s[4:5]
+; GCN-NEXT:    v_writelane_b32 v40, s33, 2
+; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    s_add_u32 s32, s32, 0x400
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    flat_load_dword v0, v[0:1]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc, 21, v0
+; GCN-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
+; GCN-NEXT:    v_writelane_b32 v40, s31, 1
+; GCN-NEXT:    s_cbranch_vccz BB0_3
+; GCN-NEXT:  ; %bb.1: ; %bb4
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
+; GCN-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-NEXT:    s_cbranch_vccnz BB0_4
+; GCN-NEXT:  ; %bb.2: ; %bb7
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, _Z3dotDv3_fS_@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, _Z3dotDv3_fS_@rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    s_branch BB0_7
+; GCN-NEXT:  BB0_3: ; %bb2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 21, v0
+; GCN-NEXT:    s_and_b64 vcc, exec, vcc
+; GCN-NEXT:    s_cbranch_vccnz BB0_6
+; GCN-NEXT:  BB0_4: ; %bb9
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, _Z3dotDv3_fS_@rel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, _Z3dotDv3_fS_@rel32@hi+12
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v0
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execnz BB0_7
+; GCN-NEXT:  ; %bb.5: ; %bb9.bb12_crit_edge
+; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT:  BB0_6: ; %bb12
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-NEXT:    flat_store_dword v[0:1], v2
+; GCN-NEXT:  BB0_7: ; %UnifiedReturnBlock
+; GCN-NEXT:    v_readlane_b32 s4, v40, 0
+; GCN-NEXT:    v_readlane_b32 s5, v40, 1
+; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
+; GCN-NEXT:    v_readlane_b32 s33, v40, 2
+; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; GCN-NEXT:    s_mov_b64 exec, s[6:7]
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[4:5]
+bb:
+  %i = load i32, i32 addrspace(1)* null, align 16
+  %i1 = icmp slt i32 %i, 21
+  br i1 %i1, label %bb4, label %bb2
+
+bb2:                                              ; preds = %bb
+  %i3 = icmp eq i32 %i, 21
+  br i1 %i3, label %bb12, label %bb9
+
+bb4:                                              ; preds = %bb
+  %i5 = icmp eq i32 %i, 9
+  br i1 %i5, label %bb7, label %bb9
+
+bb6:                                              ; preds = %bb9
+  ret void
+
+bb7:                                              ; preds = %bb4
+  %i8 = call float @_Z3dotDv3_fS_()
+  ret void
+
+bb9:                                              ; preds = %bb4, %bb2
+  %i10 = call float @_Z3dotDv3_fS_()
+  %i11 = fcmp nsz ogt float %i10, 0.000000e+00
+  br i1 %i11, label %bb6, label %bb12
+
+bb12:                                             ; preds = %bb9, %bb2
+  store float 0.000000e+00, float addrspace(1)* null, align 8
+  ret void
+}
+
+declare hidden float @_Z3dotDv3_fS_() local_unnamed_addr