Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -34,6 +34,7 @@ const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const MachineBranchProbabilityInfo *MBPI; + MachineLoopInfo *MLI; const MachineModuleInfo *MMI; MachineRegisterInfo *MRI; MachineFunction *MF; @@ -53,6 +54,8 @@ public: /// Prepare to run on a specific machine function. /// @param MF - Function that will be processed + /// @param MLI - Machine Loop Info. Used to not duplicate latches, headers, + /// or preheaders when duplicating early. /// @param MBPI - Branch Probability Info. Used to propagate correct /// probabilities when modifying the CFG. /// @param LayoutMode - When true, don't use the existing layout to make @@ -61,7 +64,7 @@ /// default implies using the command line value TailDupSize. void initMF(MachineFunction &MF, const MachineBranchProbabilityInfo *MBPI, - bool LayoutMode, unsigned TailDupSize = 0); + MachineLoopInfo *MLI, bool LayoutMode, unsigned TailDupSize = 0); bool tailDuplicateBlocks(); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(bool IsSimple, MachineBasicBlock &TailBB); Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -1971,7 +1971,7 @@ unsigned TailDupSize = TailDuplicatePlacementThreshold; if (MF.getFunction()->optForSize()) TailDupSize = 1; - TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize); + TailDup.initMF(MF, MBPI, MLI, /* LayoutMode */ true, TailDupSize); } assert(BlockToChain.empty()); Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TailDuplicator.h" #include "llvm/IR/Function.h" @@ -48,8 +49,9 @@ return false; auto MBPI = &getAnalysis(); + auto MLI = &getAnalysis(); - Duplicator.initMF(MF, MBPI, /* LayoutMode */ false); + Duplicator.initMF(MF, MBPI, MLI, /* LayoutMode */ false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks()) @@ -60,5 +62,6 @@ void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -65,6 +65,7 @@ void TailDuplicator::initMF(MachineFunction &MFin, const MachineBranchProbabilityInfo *MBPIin, + MachineLoopInfo *MLIin, bool LayoutModeIn, unsigned TailDupSizeIn) { MF = &MFin; TII = MF->getSubtarget().getInstrInfo(); @@ -72,9 +73,12 @@ MRI = &MF->getRegInfo(); MMI = &MF->getMMI(); MBPI = MBPIin; + MLI = MLIin; TailDupSize = TailDupSizeIn; assert(MBPI != nullptr && "Machine Branch Probability Info required"); + assert((LayoutModeIn || MLI != nullptr) && + "Machine Loop Info required in non-layout mode."); LayoutMode = LayoutModeIn; PreRegAlloc = MRI->isSSA(); @@ -554,6 +558,21 @@ && TailBB.canFallThrough()) return false; + // When tail-duplicating relatively early, we shouldn't de-normalize loops by + // duplicating latches, headers, or loop-predecessors. This interferes with + // later optimizations. During layout this is fine. + if (!LayoutMode) { + MachineLoop *ML = MLI->getLoopFor(&TailBB); + if (ML && (&TailBB == ML->getHeader() || + &TailBB == ML->getLoopLatch())) + return false; + for (auto SB : TailBB.successors()) { + MachineLoop *SML = MLI->getLoopFor(SB); + if (SML && (&TailBB == SML->getLoopPredecessor())) + return false; + } + } + // If the target has hardware branch prediction that can handle indirect // branches, duplicating them can often make them predictable when there // are common paths through the code. The limit needs to be high enough @@ -977,6 +996,8 @@ if (RemovalCallback) (*RemovalCallback)(MBB); + if (MLI) + MLI->removeBlock(MBB); // Remove all successors. while (!MBB->succ_empty()) Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -402,32 +402,39 @@ ; CHECK: .LBB11_1: ; CHECK: loop{{$}} ; CHECK: block{{$}} +; CHECK: block{{$}} ; CHECK: br_if 0, $0{{$}} ; CHECK: br 1{{$}} ; CHECK: .LBB11_3: -; CHECK: end_block{{$}} +; CHECK-NEXT: end_block{{$}} ; CHECK: block{{$}} ; CHECK: br_if 0, $1{{$}} ; CHECK: br 1{{$}} ; CHECK: .LBB11_5: -; CHECK: br 0{{$}} +; CHECK-NEXT: end_block{{$}} ; CHECK: .LBB11_6: +; CHECK-NEXT: end_block{{$}} +; CHECK: br 0{{$}} +; CHECK: .LBB11_7: ; CHECK-NEXT: end_loop{{$}} ; OPT-LABEL: doublediamond_in_a_loop: ; OPT: .LBB11_1: ; OPT: loop{{$}} ; OPT: block{{$}} -; OPT: br_if 0, {{[^,]+}}{{$}} +; OPT: block{{$}} ; OPT: block{{$}} ; OPT: br_if 0, {{[^,]+}}{{$}} +; OPT: br_if 1, {{[^,]+}}{{$}} ; OPT: br 2{{$}} ; OPT-NEXT: .LBB11_4: ; OPT-NEXT: end_block{{$}} ; OPT: br 1{{$}} ; OPT: .LBB11_5: ; OPT-NEXT: end_block{{$}} -; OPT: br 0{{$}} ; OPT: .LBB11_6: +; OPT-NEXT: end_block{{$}} +; OPT: br 0{{$}} +; OPT: .LBB11_7: ; OPT-NEXT: end_loop{{$}} define i32 @doublediamond_in_a_loop(i32 %a, i32 %b, i32* %p) { entry: Index: test/CodeGen/X86/avx512-i1test.ll =================================================================== --- test/CodeGen/X86/avx512-i1test.ll +++ test/CodeGen/X86/avx512-i1test.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s +; RUN: llc -O2 < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s ; ModuleID = 'bugpoint-reduced-simplified.bc' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -16,6 +16,7 @@ ; CHECK-NEXT: LBB0_2: ## %bb35 ; CHECK-NEXT: ## in Loop: Header=BB0_1 Depth=1 ; CHECK-NEXT: kortestw %k0, %k0 +; CHECK-NEXT: kortestw %k0, %k0 ; CHECK-NEXT: LBB0_1: ## %bb33 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: kortestw %k0, %k0 Index: test/CodeGen/X86/tail-dup-no-denormalize-loops.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/tail-dup-no-denormalize-loops.ll @@ -0,0 +1,61 @@ +; RUN: llc -O2 %s -o - | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.binaryTree = type { i32, %struct.binaryTree*, %struct.binaryTree* } + +; Function Attrs: norecurse nounwind readonly uwtable +define i32 @memberOfSortedBinaryTree(%struct.binaryTree* readonly %tree, i32 %searchedValue) local_unnamed_addr #4 { +entry: + %tobool122 = icmp eq %struct.binaryTree* %tree, null + br i1 %tobool122, label %while.end, label %while.cond.us.preheader.preheader + +while.cond.us.preheader.preheader: ; preds = %entry + br label %while.cond.us.preheader + +while.cond.us.preheader: ; preds = %while.cond.outer.backedge, %while.cond.us.preheader.preheader + %tree.pn = phi %struct.binaryTree* [ %1, %while.cond.outer.backedge ], [ %tree, %while.cond.us.preheader.preheader ] + %value25 = getelementptr inbounds %struct.binaryTree, %struct.binaryTree* %tree.pn, i64 0, i32 0 + %0 = load i32, i32* %value25, align 8 + %cmp.us = icmp sgt i32 %0, %searchedValue + br i1 %cmp.us, label %if.then, label %if.else.us + +if.else.us: ; preds = %while.cond.us.preheader + %cmp3.us = icmp slt i32 %0, %searchedValue + br i1 %cmp3.us, label %if.then4, label %while.end.loopexit + +; Make sure that %while.cond.outer.backedge is not duplicated into %if.then +; CHECK: # %if.then{{$}} +; CHECK-NEXT: # in Loop: +; CHECK-NEXT: addq $8 +; CHECK-NEXT: jmp +if.then: ; preds = %while.cond.us.preheader + %left = getelementptr inbounds %struct.binaryTree, %struct.binaryTree* %tree.pn, i64 0, i32 1 + br label %while.cond.outer.backedge + +while.cond.outer.backedge: ; preds = %if.then4, %if.then + %left.sink = phi %struct.binaryTree** [ %left, %if.then ], [ %right, %if.then4 ] + %1 = load %struct.binaryTree*, %struct.binaryTree** %left.sink, align 8 + %tobool1 = icmp eq %struct.binaryTree* %1, null + br i1 %tobool1, label %while.end.loopexit, label %while.cond.us.preheader + +; Make sure that %while.cond.outer.backedge is not duplicated into %if.then4 +; CHECK: # %if.then4{{$}} +; CHECK-NEXT: # in Loop: +; CHECK-NEXT: addq $16 +; CHECK-NEXT: jmp +if.then4: ; preds = %if.else.us + %right = getelementptr inbounds %struct.binaryTree, %struct.binaryTree* %tree.pn, i64 0, i32 2 + br label %while.cond.outer.backedge + +while.end.loopexit: ; preds = %while.cond.outer.backedge, %if.else.us + %found.0.lcssa.ph = phi i32 [ 0, %while.cond.outer.backedge ], [ 1, %if.else.us ] + br label %while.end + +while.end: ; preds = %while.end.loopexit, %entry + %found.0.lcssa = phi i32 [ 0, %entry ], [ %found.0.lcssa.ph, %while.end.loopexit ] + ret i32 %found.0.lcssa +} + + +attributes #4 = { norecurse nounwind readonly uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }