Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -133,6 +133,14 @@ "that won't conflict."), cl::init(2), cl::Hidden); +// Heuristic for aggressive tail duplication. +static cl::opt TailDupPlacementAggressiveThreshold( + "tail-dup-placement-aggressive-threshold", + cl::desc("Instruction cutoff for aggressive tail duplication during " + "layout. Used at -O3. Tail merging during layout is forced to " + "have a threshold that won't conflict."), cl::init(3), + cl::Hidden); + // Heuristic for tail duplication. static cl::opt TailDupPlacementPenalty( "tail-dup-placement-penalty", @@ -2646,9 +2654,26 @@ assert(BlockToChain.empty()); assert(ComputedEdges.empty()); + unsigned TailDupSize = TailDupPlacementThreshold; + // If only the aggressive threshold is explicitly set, use it. + if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && + TailDupPlacementThreshold.getNumOccurrences() == 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + + TargetPassConfig *PassConfig = &getAnalysis(); + // For aggressive optimization, we can adjust some thresholds to be less + // conservative. + if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) { + // At O3 we should be more willing to copy blocks for tail duplication. This + // increases size pressure, so we only do it at O3. + // Do this unless only the regular threshold is explicitly set. 
+ if (TailDupPlacementThreshold.getNumOccurrences() == 0 || + TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + } + if (TailDupPlacement) { MPDT = &getAnalysis(); - unsigned TailDupSize = TailDupPlacementThreshold; if (MF.getFunction()->optForSize()) TailDupSize = 1; TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); @@ -2658,7 +2683,6 @@ buildCFGChains(); // Changing the layout can create new tail merging opportunities. - TargetPassConfig *PassConfig = &getAnalysis(); // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structured CFG. bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && @@ -2666,7 +2690,7 @@ BranchFoldPlacement; // No tail merging opportunities if the block number is less than four. if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDupPlacementThreshold + 1; + unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, TailMergeSize); Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,4 +1,5 @@ -; RUN: llc -O2 < %s | FileCheck %s +; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s +; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" @@ -99,11 +100,9 @@ ; test1 ; test2 ; test3 -; test4 ; optional1 ; optional2 ; optional3 -; optional4 ; exit ; even for 50/50 branches. ; Tail duplication puts test n+1 at the end of optional n @@ -162,6 +161,98 @@ ret void } +; Intended layout: +; The chain-of-triangles based duplicating produces the layout when 3 +; instructions are allowed for tail-duplication. 
+; test1 +; test2 +; test3 +; optional1 +; optional2 +; optional3 +; exit +; +; Otherwise it produces the layout: +; test1 +; optional1 +; test2 +; optional2 +; test3 +; optional3 +; exit + +;CHECK-LABEL: straight_test_3_instr_test: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30 +;CHECK-NEXT: cmplwi {{[0-9]+}}, 2 + +;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: # %test2 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O3: blr +;CHECK-O3-NEXT: .[[OPT1LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-O3-NEXT: .[[OPT2LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-O3-NEXT: .[[OPT3LABEL]]: +;CHECK-O3: b .[[EXITLABEL]] + +;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional1 +;CHECK-O2: .[[TEST2LABEL]]: # %test2 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional2 +;CHECK-O2: .[[TEST3LABEL]]: # %test3 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional3 +;CHECK-O2: .[[EXITLABEL]]: # %exit +;CHECK-O2: blr + + +define void @straight_test_3_instr_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 3 + 
%tagbit1eq0 = icmp eq i32 %tagbit1, 2 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 12 + %tagbit2eq0 = icmp eq i32 %tagbit2, 8 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 48 + %tagbit3eq0 = icmp eq i32 %tagbit3, 32 + br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1 +optional3: + call void @c() + br label %exit +exit: + ret void +} + ; Intended layout: ; The chain-based outlining produces the layout ; entry Index: test/CodeGen/X86/sse1.ll =================================================================== --- test/CodeGen/X86/sse1.ll +++ test/CodeGen/X86/sse1.ll @@ -66,7 +66,10 @@ ; X32-NEXT: jne .LBB1_8 ; X32-NEXT: .LBB1_7: ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: jmp .LBB1_9 +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: je .LBB1_10 +; X32-NEXT: jmp .LBB1_11 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -77,11 +80,10 @@ ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: .LBB1_9: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: jne .LBB1_11 -; X32-NEXT: # BB#10: +; X32-NEXT: .LBB1_10: ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_11: # %entry ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -103,7 +105,10 @@ ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: jmp .LBB1_9 +; X64-NEXT: testl %esi, %esi +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: je .LBB1_10 +; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: ; X64-NEXT: 
movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: testl %edx, %edx @@ -114,11 +119,10 @@ ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: .LBB1_9: # %entry ; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: jne .LBB1_11 -; X64-NEXT: # BB#10: +; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]