diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -313,8 +313,15 @@
 
   Value *Exec = popSaved();
   Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
-  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
+    Instruction *ExecDef = cast<Instruction>(Exec);
+    BasicBlock *DefBB = ExecDef->getParent();
+    if (!DT->dominates(DefBB, BB)) {
+      // Split the DefBB->BB edge so end.cf sits in a block DefBB dominates.
+      FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+    }
     CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
+  }
 }
 
 /// Annotate the control flow with intrinsics so the backend can
@@ -327,7 +334,6 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
 
   initialize(*F.getParent(), TM.getSubtarget<GCNSubtarget>(F));
-
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
     BasicBlock *BB = *I;
@@ -344,7 +350,8 @@
       if (isTopOfStack(BB))
         closeControlFlow(BB);
 
-      handleLoop(Term);
+      if (DT->dominates(Term->getSuccessor(1), BB))
+        handleLoop(Term);
       continue;
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll b/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/virtual-register-defs-dont-dominate-all-uses.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -si-annotate-control-flow -mtriple=amdgcn-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=SI-OPT %s
+
+define hidden void @blam() {
+; SI-OPT-LABEL: @blam(
+; SI-OPT-NEXT:  bb:
+; SI-OPT-NEXT:    [[TMP:%.*]] = load float, float* null, align 16
+; SI-OPT-NEXT:    br label [[BB2:%.*]]
+; SI-OPT:       bb1:
+; SI-OPT-NEXT:    br label [[BB2]]
+; SI-OPT:       bb2:
+; SI-OPT-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; SI-OPT-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* null, i32 [[TID]]
+; SI-OPT-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 16
+; SI-OPT-NEXT:    store float 0.000000e+00, float addrspace(5)* null, align 8
+; SI-OPT-NEXT:    br label [[BB4:%.*]]
+; SI-OPT:       bb4:
+; SI-OPT-NEXT:    [[TMP5:%.*]] = icmp slt i32 [[TMP3]], 3
+; SI-OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]])
+; SI-OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; SI-OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; SI-OPT-NEXT:    br i1 [[TMP1]], label [[BB8:%.*]], label [[BB6:%.*]]
+; SI-OPT:       bb6:
+; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; SI-OPT-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP3]], 3
+; SI-OPT-NEXT:    br i1 [[TMP7]], label [[BB11:%.*]], label [[BB1:%.*]]
+; SI-OPT:       bb8:
+; SI-OPT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP3]], 1
+; SI-OPT-NEXT:    [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP9]])
+; SI-OPT-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP3]], 0
+; SI-OPT-NEXT:    [[TMP5:%.*]] = extractvalue { i1, i64 } [[TMP3]], 1
+; SI-OPT-NEXT:    br i1 [[TMP4]], label [[BB10:%.*]], label [[BB8_BB1_CRIT_EDGE:%.*]]
+; SI-OPT:       bb8.bb1_crit_edge:
+; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; SI-OPT-NEXT:    br label [[BB1]]
+; SI-OPT:       bb10:
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    br label [[BB18:%.*]]
+; SI-OPT:       bb11:
+; SI-OPT-NEXT:    [[TMP12:%.*]] = call float @spam()
+; SI-OPT-NEXT:    [[TMP13:%.*]] = fcmp nsz oeq float [[TMP12]], 0.000000e+00
+; SI-OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP13]])
+; SI-OPT-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0
+; SI-OPT-NEXT:    [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1
+; SI-OPT-NEXT:    br i1 [[TMP7]], label [[BB2]], label [[BB14:%.*]]
+; SI-OPT:       bb14:
+; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
+; SI-OPT-NEXT:    [[TMP15:%.*]] = fcmp nsz oeq float [[TMP]], 0.000000e+00
+; SI-OPT-NEXT:    [[TMP9:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
+; SI-OPT-NEXT:    [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP9]], 0
+; SI-OPT-NEXT:    [[TMP11:%.*]] = extractvalue { i1, i64 } [[TMP9]], 1
+; SI-OPT-NEXT:    br i1 [[TMP10]], label [[BB17:%.*]], label [[BB16:%.*]]
+; SI-OPT:       bb16:
+; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP11]])
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    br label [[BB17]]
+; SI-OPT:       bb17:
+; SI-OPT-NEXT:    store float [[TMP]], float addrspace(5)* null, align 16
+; SI-OPT-NEXT:    br label [[BB18]]
+; SI-OPT:       bb18:
+; SI-OPT-NEXT:    store float 0x7FF8000000000000, float addrspace(5)* null, align 4
+; SI-OPT-NEXT:    br label [[BB2]]
+;
+bb:
+  %tmp = load float, float* null, align 16
+  br label %bb2
+
+bb1:                                              ; preds = %bb8, %bb6
+  br label %bb2
+
+bb2:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %tid
+  %tmp3 = load i32, i32 addrspace(1)* %gep, align 16
+  store float 0.000000e+00, float addrspace(5)* null, align 8
+  br label %bb4
+
+bb4:                                              ; preds = %bb2
+  %tmp5 = icmp slt i32 %tmp3, 3
+  br i1 %tmp5, label %bb8, label %bb6
+
+bb6:                                              ; preds = %bb4
+  %tmp7 = icmp eq i32 %tmp3, 3
+  br i1 %tmp7, label %bb11, label %bb1
+
+bb8:                                              ; preds = %bb4
+  %tmp9 = icmp eq i32 %tmp3, 1
+  br i1 %tmp9, label %bb10, label %bb1
+
+bb10:                                             ; preds = %bb8
+  store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+  br label %bb18
+
+bb11:                                             ; preds = %bb6
+  %tmp12 = call float @spam()
+  %tmp13 = fcmp nsz oeq float %tmp12, 0.000000e+00
+  br i1 %tmp13, label %bb2, label %bb14
+
+bb14:                                             ; preds = %bb11
+  %tmp15 = fcmp nsz oeq float %tmp, 0.000000e+00
+  br i1 %tmp15, label %bb17, label %bb16
+
+bb16:                                             ; preds = %bb14
+  store float 0x7FF8000000000000, float addrspace(5)* null, align 16
+  br label %bb17
+
+bb17:                                             ; preds = %bb16, %bb14
+  store float %tmp, float addrspace(5)* null, align 16
+  br label %bb18
+
+bb18:                                             ; preds = %bb17, %bb10
+  store float 0x7FF8000000000000, float addrspace(5)* null, align 4
+  br label %bb2
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+declare hidden float @spam()