Index: lib/Transforms/Scalar/StructurizeCFG.cpp =================================================================== --- lib/Transforms/Scalar/StructurizeCFG.cpp +++ lib/Transforms/Scalar/StructurizeCFG.cpp @@ -936,6 +936,11 @@ static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID, const LegacyDivergenceAnalysis &DA) { + // Whether all sub-regions are uniform. + bool SubRegionsAreUniform = true; + // Number of direct children that are conditional. + unsigned ConditionalDirectChildren = 0; + for (auto E : R->elements()) { if (!E->isSubRegion()) { auto Br = dyn_cast(E->getEntry()->getTerminator()); @@ -944,6 +949,10 @@ if (!DA.isUniform(Br)) return false; + + // One of our direct children is conditional. + ConditionalDirectChildren++; + LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName() << " has uniform terminator\n"); } else { @@ -961,12 +970,21 @@ if (!Br || !Br->isConditional()) continue; - if (!Br->getMetadata(UniformMDKindID)) - return false; + if (!Br->getMetadata(UniformMDKindID)) { + SubRegionsAreUniform = false; + break; + } } } } - return true; + + // Our region is uniform if: + // 1. All conditional branches that are direct children are uniform (checked + // above). + // 2. And either: + // a. All sub-regions are uniform. + // b. There is at most one conditional branch among the direct children. 
+ return SubRegionsAreUniform || (ConditionalDirectChildren <= 1); } /// Run the transformation for each region found Index: test/CodeGen/AMDGPU/control-flow-optnone.ll =================================================================== --- test/CodeGen/AMDGPU/control-flow-optnone.ll +++ test/CodeGen/AMDGPU/control-flow-optnone.ll @@ -15,8 +15,8 @@ ; GCN: s_mov_b64 exec ; GCN: s_or_b64 exec, exec -; GCN: v_cmp_eq_u32 -; GCN: s_cbranch_vccnz +; GCN: s_cmp_eq_u32 +; GCN: s_cbranch_scc1 ; GCN-NEXT: s_branch define amdgpu_kernel void @copytoreg_divergent_brcond(i32 %arg, i32 %arg1, i32 %arg2) #0 { bb: Index: test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll =================================================================== --- test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll +++ test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll @@ -79,4 +79,110 @@ ret void } +define amdgpu_cs void @uniform_branch_to_nonuniform_subregions(i32 addrspace(4)* %ptr, i32 inreg %data) { +; CHECK-LABEL: @uniform_branch_to_nonuniform_subregions( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[DATA:%.*]], 42 +; CHECK-NEXT: br i1 [[C]], label [[UNIFORM_FOR_BODY:%.*]], label [[FOR_BODY:%.*]], !structurizecfg.uniform !0 +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[FLOW1:%.*]] ] +; CHECK-NEXT: [[CC:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: br i1 [[CC]], label [[MID_LOOP:%.*]], label [[FLOW1]] +; CHECK: mid.loop: +; CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[CC2:%.*]] = icmp eq i32 [[V]], 0 +; CHECK-NEXT: br i1 [[CC2]], label [[END_LOOP:%.*]], label [[FLOW2:%.*]] +; CHECK: Flow1: +; CHECK-NEXT: [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW2]] ], [ undef, [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ [[TMP3:%.*]], [[FLOW2]] ], [ true, [[FOR_BODY]] ] +; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: end.loop: +; CHECK-NEXT: [[I_INC:%.*]] = 
add i32 [[I]], 1 +; CHECK-NEXT: br label [[FLOW2]] +; CHECK: Flow2: +; CHECK-NEXT: [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ undef, [[MID_LOOP]] ] +; CHECK-NEXT: [[TMP3]] = phi i1 [ false, [[END_LOOP]] ], [ true, [[MID_LOOP]] ] +; CHECK-NEXT: br label [[FLOW1]] +; CHECK: for.end: +; CHECK-NEXT: br i1 [[CC]], label [[IF:%.*]], label [[FLOW:%.*]] +; CHECK: if: +; CHECK-NEXT: br label [[FLOW]] +; CHECK: uniform.for.body: +; CHECK-NEXT: [[UNIFORM_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP4:%.*]], [[FLOW4:%.*]] ] +; CHECK-NEXT: [[UNIFORM_CC:%.*]] = icmp ult i32 [[UNIFORM_I]], 4 +; CHECK-NEXT: br i1 [[UNIFORM_CC]], label [[UNIFORM_MID_LOOP:%.*]], label [[FLOW4]] +; CHECK: uniform.mid.loop: +; CHECK-NEXT: [[UNIFORM_V:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[UNIFORM_CC2:%.*]] = icmp eq i32 [[UNIFORM_V]], 0 +; CHECK-NEXT: br i1 [[UNIFORM_CC2]], label [[UNIFORM_END_LOOP:%.*]], label [[FLOW5:%.*]] +; CHECK: Flow4: +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[TMP6:%.*]], [[FLOW5]] ], [ undef, [[UNIFORM_FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW5]] ], [ true, [[UNIFORM_FOR_BODY]] ] +; CHECK-NEXT: br i1 [[TMP5]], label [[UNIFORM_FOR_END:%.*]], label [[UNIFORM_FOR_BODY]] +; CHECK: uniform.end.loop: +; CHECK-NEXT: [[UNIFORM_I_INC:%.*]] = add i32 [[UNIFORM_I]], 1 +; CHECK-NEXT: br label [[FLOW5]] +; CHECK: Flow5: +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[UNIFORM_I_INC]], [[UNIFORM_END_LOOP]] ], [ undef, [[UNIFORM_MID_LOOP]] ] +; CHECK-NEXT: [[TMP7]] = phi i1 [ false, [[UNIFORM_END_LOOP]] ], [ true, [[UNIFORM_MID_LOOP]] ] +; CHECK-NEXT: br label [[FLOW4]] +; CHECK: uniform.for.end: +; CHECK-NEXT: br i1 [[UNIFORM_CC]], label [[UNIFORM_IF:%.*]], label [[FLOW3:%.*]] +; CHECK: uniform.if: +; CHECK-NEXT: br label [[FLOW3]] +; CHECK: Flow: +; CHECK-NEXT: br label [[END:%.*]] +; CHECK: Flow3: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + %c = icmp eq i32 %data, 42 + br i1 %c, label %uniform.for.body, 
label %for.body + +for.body: + %i = phi i32 [0, %entry], [%i.inc, %end.loop] + %cc = icmp ult i32 %i, 4 + br i1 %cc, label %mid.loop, label %for.end + +mid.loop: + %v = call i32 @llvm.amdgcn.workitem.id.x() + %cc2 = icmp eq i32 %v, 0 + br i1 %cc2, label %end.loop, label %for.end + +end.loop: + %i.inc = add i32 %i, 1 + br label %for.body + +for.end: + br i1 %cc, label %if, label %end + +if: + br label %end + +uniform.for.body: + %uniform.i = phi i32 [0, %entry], [%uniform.i.inc, %uniform.end.loop] + %uniform.cc = icmp ult i32 %uniform.i, 4 + br i1 %uniform.cc, label %uniform.mid.loop, label %uniform.for.end + +uniform.mid.loop: + %uniform.v = call i32 @llvm.amdgcn.workitem.id.x() + %uniform.cc2 = icmp eq i32 %uniform.v, 0 + br i1 %uniform.cc2, label %uniform.end.loop, label %uniform.for.end + +uniform.end.loop: + %uniform.i.inc = add i32 %uniform.i, 1 + br label %uniform.for.body + +uniform.for.end: + br i1 %uniform.cc, label %uniform.if, label %end + +uniform.if: + br label %end + +end: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x()