Index: include/llvm/CodeGen/TailDuplicator.h =================================================================== --- include/llvm/CodeGen/TailDuplicator.h +++ include/llvm/CodeGen/TailDuplicator.h @@ -35,6 +35,7 @@ MachineRegisterInfo *MRI; std::unique_ptr RS; bool PreRegAlloc; + bool LayoutMode; // A list of virtual registers for which to update SSA form. SmallVector SSAUpdateVRs; @@ -46,14 +47,18 @@ DenseMap SSAUpdateVals; public: + /// Prepare to run on a specific machine function. + /// LayoutMode - When true, don't use the existing layout to make decisions, + /// and don't remove blocks. void initMF(MachineFunction &MF, const MachineModuleInfo *MMI, - const MachineBranchProbabilityInfo *MBPI); + const MachineBranchProbabilityInfo *MBPI, bool LayoutMode); bool tailDuplicateBlocks(MachineFunction &MF); static bool isSimpleBB(MachineBasicBlock *TailBB); bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB); bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB); + MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred); private: typedef TargetInstrInfo::RegSubRegPair RegSubRegPair; @@ -79,6 +84,7 @@ SmallVectorImpl &Copies); bool tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies); void appendCopies(MachineBasicBlock *MBB, Index: lib/CodeGen/BranchFolding.cpp =================================================================== --- lib/CodeGen/BranchFolding.cpp +++ lib/CodeGen/BranchFolding.cpp @@ -591,13 +591,21 @@ /// and decide if it would be profitable to merge those tails. Return the /// length of the common tail and iterators to the first common instruction /// in each block. 
+/// MBB1, MBB2 The blocks to check +/// I1, I2 Iterator references that will be changed to point to the first +/// instruction in the common tail shared by MBB1,MBB2 +/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form +/// relative to SuccBB +/// PredBB The layout predecessor of SuccBB, if any. +/// AfterPlacement Flag to indicate that block placement has already run. static bool ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2, unsigned minCommonTailLength, unsigned &CommonTailLen, MachineBasicBlock::iterator &I1, MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB, MachineBasicBlock *PredBB, - DenseMap &FuncletMembership) { + DenseMap &FuncletMembership, + bool AfterPlacement) { // It is never profitable to tail-merge blocks from two different funclets. if (!FuncletMembership.empty()) { auto Funclet1 = FuncletMembership.find(MBB1); @@ -620,7 +628,9 @@ if (MBB1 == PredBB || MBB2 == PredBB) { MachineBasicBlock::iterator I; unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I); - if (CommonTailLen > NumTerms) + // When running after block placement, we don't want to undo + // tail-duplication that occurred during placement. + if (!AfterPlacement && CommonTailLen > NumTerms) return true; } @@ -637,10 +647,17 @@ // count that as an additional common instruction for the following // heuristics. unsigned EffectiveTailLen = CommonTailLen; - if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && - !MBB1->back().isBarrier() && - !MBB2->back().isBarrier()) - ++EffectiveTailLen; + // We can't do this during layout because it undoes tail duplication, despite + // non-overlapping thresholds. What happens is that a block with a + // conditional branch and fallthrough gets tail-duplicated and this heuristic + // gets run with the non-fallthrough successor as SuccBB, and counts an extra + // branch, despite the block having only 2 instructions when it was + // duplicated. 
+ if (!AfterPlacement) + if (SuccBB && MBB1 != PredBB && MBB2 != PredBB && + !MBB1->back().isBarrier() && + !MBB2->back().isBarrier()) + ++EffectiveTailLen; // Check if the common tail is long enough to be worthwhile. if (EffectiveTailLen >= minCommonTailLength) @@ -682,7 +699,8 @@ minCommonTailLength, CommonTailLen, TrialBBI1, TrialBBI2, SuccBB, PredBB, - FuncletMembership)) { + FuncletMembership, + AfterBlockPlacement)) { if (CommonTailLen > maxCommonTailLength) { SameTails.clear(); maxCommonTailLength = CommonTailLen; @@ -816,7 +834,7 @@ } } -// See if any of the blocks in MergePotentials (which all have a common single +// See if any of the blocks in MergePotentials (which all have a common // successor, or all have no successor) can be tail-merged. If there is a // successor, any blocks in MergePotentials that are not tail-merged and // are not immediately before Succ must have an unconditional branch to Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -40,6 +40,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/TailDuplicator.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -117,6 +118,12 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt +TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt BranchFoldPlacement("branch-fold-placement", @@ -258,6 +265,13 @@ /// \brief A handle to the post dominator tree. MachineDominatorTree *MDT; + /// \brief Duplicator used to duplicate tails during placement. 
+ /// + /// Placement decisions can open up new tail duplication opportunities, but + /// since tail duplication affects placement decisions of later blocks, it + /// must be done inline. + TailDuplicator TailDup; + /// \brief A set of blocks that are unavoidably execute, i.e. they dominate /// all terminators of the MachineFunction. SmallPtrSet UnavoidableBlocks; @@ -864,6 +878,16 @@ // after this block. MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + // Placing an actual successor may have changed tail duplication + // opportunities. Check for that now. + if (TailDupPlacement && BestSucc) { + DEBUG(dbgs() << "Redoing tail duplication for BestSucc#" + << BestSucc->getNumber() << "\n"); + bool IsSimple = TailDup.isSimpleBB(BestSucc); + if (TailDup.shouldTailDuplicate(*F, IsSimple, *BestSucc)) + TailDup.tailDuplicateAndUpdate(*F, IsSimple, BestSucc, BB); + } + // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. @@ -1659,6 +1683,10 @@ TII = MF.getSubtarget().getInstrInfo(); TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis(); + auto MMI = getAnalysisIfAvailable(); + if (TailDupPlacement) + TailDup.initMF(MF, MMI, MBPI, /* LayoutMode */ true); + assert(BlockToChain.empty()); buildCFGChains(); @@ -1680,6 +1708,8 @@ /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. BlockToChain.clear(); + // Must redo the dominator tree if blocks were changed. 
+ MDT->runOnMachineFunction(MF); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: lib/CodeGen/TailDuplication.cpp =================================================================== --- lib/CodeGen/TailDuplication.cpp +++ lib/CodeGen/TailDuplication.cpp @@ -50,7 +50,7 @@ auto MMI = getAnalysisIfAvailable(); auto MBPI = &getAnalysis(); - Duplicator.initMF(MF, MMI, MBPI); + Duplicator.initMF(MF, MMI, MBPI, /* LayoutMode */ false); bool MadeChange = false; while (Duplicator.tailDuplicateBlocks(MF)) Index: lib/CodeGen/TailDuplicator.cpp =================================================================== --- lib/CodeGen/TailDuplicator.cpp +++ lib/CodeGen/TailDuplicator.cpp @@ -57,7 +57,8 @@ namespace llvm { void TailDuplicator::initMF(MachineFunction &MF, const MachineModuleInfo *MMIin, - const MachineBranchProbabilityInfo *MBPIin) { + const MachineBranchProbabilityInfo *MBPIin, + bool LayoutModeIn) { TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -66,6 +67,7 @@ assert(MBPI != nullptr && "Machine Branch Probability Info required"); + LayoutMode = LayoutModeIn; PreRegAlloc = MRI->isSSA(); RS.reset(); @@ -123,15 +125,16 @@ } /// Tail duplicate the block and cleanup. -bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple, - MachineBasicBlock *MBB) { +bool TailDuplicator::tailDuplicateAndUpdate( + MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB, + MachineBasicBlock *ForcedLayoutPred) { // Save the successors list. 
SmallSetVector Succs(MBB->succ_begin(), MBB->succ_end()); SmallVector TDBBs; SmallVector Copies; - if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies)) + if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies)) return false; ++NumTails; @@ -245,7 +248,7 @@ if (!shouldTailDuplicate(MF, IsSimple, *MBB)) continue; - MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB); + MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr); } if (PreRegAlloc && TailDupVerify) @@ -510,8 +513,18 @@ bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF, bool IsSimple, MachineBasicBlock &TailBB) { - // Only duplicate blocks that end with unconditional branches. - if (TailBB.canFallThrough()) + // When doing tail-duplication during layout, the block ordering is in flux, + // so canFallThrough returns a result based on incorrect information and + // should just be ignored. + if (!LayoutMode && TailBB.canFallThrough()) + return false; + + // If the block to be duplicated ends in an unanalyzable fallthrough, don't + // duplicate it. + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; + SmallVector PredCond; + if (TailBB.canFallThrough() && + TII->AnalyzeBranch(TailBB, PredTBB, PredFBB, PredCond, true)) return false; // Don't try to tail-duplicate single-block loops. @@ -723,8 +736,14 @@ /// If it is profitable, duplicate TailBB's contents in each /// of its predecessors. +/// ForcedLayoutPred When non-null, use this block as the layout predecessor +/// instead of the previous block in MF's order. +/// TDBBs A vector to keep track of all blocks tail-duplicated into. +/// Copies A vector of copy instructions inserted. Used later to walk +/// all the inserted copies and remove redundant ones. 
bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple, MachineBasicBlock *TailBB, + MachineBasicBlock *ForcedLayoutPred, SmallVectorImpl &TDBBs, SmallVectorImpl &Copies) { DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n'); @@ -759,7 +778,12 @@ if (!PredCond.empty()) continue; // Don't duplicate into a fall-through predecessor (at least for now). - if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + bool IsLayoutSuccessor = false; + if (ForcedLayoutPred) + IsLayoutSuccessor = (ForcedLayoutPred == PredBB); + else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough()) + IsLayoutSuccessor = true; + if (IsLayoutSuccessor) continue; DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB @@ -825,13 +849,14 @@ // If TailBB was duplicated into all its predecessors except for the prior // block, which falls through unconditionally, move the contents of this - // block into the prior block. + // block into the prior block. Don't do this when LayoutMode is + // true, as there is no point in removing the block during layout. MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator()); MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr; SmallVector PriorCond; // This has to check PrevBB->succ_size() because EH edges are ignored by // AnalyzeBranch. 
- if (PrevBB->succ_size() == 1 && + if (!LayoutMode && PrevBB->succ_size() == 1 && !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) && PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 && !TailBB->hasAddressTaken()) { Index: test/CodeGen/AArch64/machine_cse.ll =================================================================== --- test/CodeGen/AArch64/machine_cse.ll +++ test/CodeGen/AArch64/machine_cse.ll @@ -1,4 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s +; -tail-dup-placement causes tail duplication during layout. This breaks the +; assumptions of the test case as written (specifically, it creates an +; additional cmp instruction, creating a false positive), so we pass +; -tail-dup-placement=0 to restore the original behavior. ; marked as external to prevent possible optimizations @a = external global i32 Index: test/CodeGen/PowerPC/branch-opt.ll =================================================================== --- test/CodeGen/PowerPC/branch-opt.ll +++ test/CodeGen/PowerPC/branch-opt.ll @@ -1,9 +1,14 @@ -; RUN: llc < %s -march=ppc32 | \ -; RUN: grep "b LBB.*" | count 4 +; RUN: llc < %s -march=ppc32 | FileCheck %s target datalayout = "E-p:32:32" target triple = "powerpc-apple-darwin8.7.0" +;CHECK-LABEL: foo: +;CHECK: b LBB0_14 +;CHECK: b LBB0_14 +;CHECK-NOT: b LBB0_14 +;CHECK: LBB0_14: ; %cond_next48 + define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) { entry: %tmp1 = and i32 %W, 1 ; [#uses=1] Index: test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll @@ -0,0 +1,64 @@ +; RUN: llc -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le--linux-gnu" + +%node = type { %node*, i64 } +; This test is checking for a bug where blocks with 
un-analyzable fallthrough +; were getting tail-duplicated, leading to incorrect code. The loop-entry check +; gets transformed to have a conditional exit, which is currently un-analyzable. +; This block should not be tail-duplicated. This is from an actual hash table +; that was miscompiling. +; CHECK-LABEL: walkList + +; Don't duplicate the entry block +; CHECK: b [[LOOPENTRY:[._0-9A-Za-z]+]] + +; The Loop top does a pointer chase. +; CHECK: [[LOOPTOP:[._0-9A-Za-z]+]]: +; CHECK-NOT: {{{[._0-9A-Za-z]+}}}: +; CHECK: ld [[PTRREG:[0-9]+]], 0([[PTRREG]]) + +; The loop entry compares the pointer to 0 before the chase and early exits if +; it is null. +; CHECK-NEXT: [[LOOPENTRY]]: +; CHECK-NOT: {{{[._0-9A-Za-z]+}}}: +; CHECK: cmpldi [[PTRREG]], 0 +; CHECK-NEXT: beqlr + +; Walk a linked list, returning the %node that contains %Key, or null if no +; such node exists. +; Function Attrs: norecurse nounwind readonly +define %node* @walkList(%node* nocapture readonly %current.in, i64 %Key) #0 { +entry: + %tobool13 = icmp eq %node* %current.in, null + br i1 %tobool13, label %cond.end, label %land.rhs.preheader + +land.rhs.preheader: ; preds = %entry + br label %land.rhs + +land.rhs: ; preds = %land.rhs.preheader, %for.inc + %current.014 = phi %node* [ %current.0, %for.inc ], [ %current.in, %land.rhs.preheader ] + %Key1 = getelementptr inbounds %node, %node* %current.014, i64 0, i32 1 + %0 = load i64, i64* %Key1, align 8 + %lnot = icmp eq i64 %0, %Key + br i1 %lnot, label %cond.true, label %for.inc + +for.inc: ; preds = %land.rhs + %Next = getelementptr inbounds %node, %node* %current.014, i64 0, i32 0 + %current.0 = load %node*, %node** %Next, align 8 + %tobool = icmp eq %node* %current.0, null + br i1 %tobool, label %cond.end.loopexit, label %land.rhs + +cond.true: ; preds = %land.rhs + %current.014.lcssa = phi %node* [ %current.014, %land.rhs ] + br label %cond.end + +cond.end.loopexit: ; preds = %for.inc + br label %cond.end + +cond.end: ; preds = %cond.end.loopexit, %entry, 
%cond.true + %cond = phi %node* [ %current.014.lcssa, %cond.true ], [ null, %entry ], [ null, %cond.end.loopexit ] + ret %node* %cond +} + +attributes #0 = { norecurse nounwind readonly } Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- /dev/null +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -0,0 +1,100 @@ +; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-n32:64" +target triple = "powerpc64le-grtev4-linux-gnu" + +; Intended layout: +; The outlining flag produces the layout +; test1 +; test2 +; test3 +; test4 +; exit +; optional1 +; optional2 +; optional3 +; optional4 +; Tail duplication puts test n+1 at the end of optional n +; so optional1 includes a copy of test2 at the end, and branches +; to test3 (at the top) or falls through to optional2. +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: f: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr +;CHECK-NEXT: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[EXITLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[EXITLABEL]] + +define void @f(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %exit, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %exit +exit: + ret void +} + +declare void @a() +declare void @b() +declare void @c() +declare void @d() Index: test/CodeGen/WebAssembly/cfg-stackify.ll =================================================================== --- test/CodeGen/WebAssembly/cfg-stackify.ll +++ test/CodeGen/WebAssembly/cfg-stackify.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s ; Test the CFG 
stackifier pass. Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s ; Test memcpy, memmove, and memset intrinsics. Index: test/CodeGen/X86/fp-une-cmp.ll =================================================================== --- test/CodeGen/X86/fp-une-cmp.ll +++ test/CodeGen/X86/fp-une-cmp.ll @@ -56,11 +56,11 @@ ; CHECK-NEXT: ucomisd %xmm1, %xmm0 ; CHECK-NEXT: jne .LBB1_1 ; CHECK-NEXT: jp .LBB1_1 -; CHECK-NEXT: .LBB1_2: # %bb2 +; CHECK-NEXT: # %bb2 ; CHECK-NEXT: retq ; CHECK-NEXT: .LBB1_1: # %bb1 ; CHECK-NEXT: addsd {{.*}}(%rip), %xmm0 -; CHECK-NEXT: jmp .LBB1_2 +; CHECK-NEXT: retq entry: %mul = fmul double %x, %y