Index: include/llvm/CodeGen/TailDuplicator.h
===================================================================
--- include/llvm/CodeGen/TailDuplicator.h
+++ include/llvm/CodeGen/TailDuplicator.h
@@ -51,9 +51,10 @@
   bool tailDuplicateBlocks(MachineFunction &MF);
   static bool isSimpleBB(MachineBasicBlock *TailBB);
   bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple,
-                           MachineBasicBlock &TailBB);
+                           MachineBasicBlock &TailBB, bool IgnoreFallthrough);
   bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple,
-                              MachineBasicBlock *MBB);
+                              MachineBasicBlock *MBB,
+                              MachineBasicBlock *ForcedLayoutPred);
 
 private:
   typedef TargetInstrInfo::RegSubRegPair RegSubRegPair;
@@ -79,6 +80,7 @@
                          SmallVectorImpl<MachineInstr *> &Copies);
   bool tailDuplicate(MachineFunction &MF, bool IsSimple,
                      MachineBasicBlock *TailBB,
+                     MachineBasicBlock *ForcedLayoutPred,
                      SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                      SmallVectorImpl<MachineInstr *> &Copies);
   void appendCopies(MachineBasicBlock *MBB,
Index: lib/CodeGen/BranchFolding.cpp
===================================================================
--- lib/CodeGen/BranchFolding.cpp
+++ lib/CodeGen/BranchFolding.cpp
@@ -597,7 +597,8 @@
                   MachineBasicBlock::iterator &I1,
                   MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
                   MachineBasicBlock *PredBB,
-                  DenseMap<const MachineBasicBlock *, int> &FuncletMembership) {
+                  DenseMap<const MachineBasicBlock *, int> &FuncletMembership,
+                  bool AfterPlacement) {
   // It is never profitable to tail-merge blocks from two different funclets.
   if (!FuncletMembership.empty()) {
     auto Funclet1 = FuncletMembership.find(MBB1);
@@ -620,7 +621,8 @@
   if (MBB1 == PredBB || MBB2 == PredBB) {
     MachineBasicBlock::iterator I;
     unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
-    if (CommonTailLen > NumTerms)
+    // Don't undo tail-duplication during layout.
+    if (!AfterPlacement && CommonTailLen > NumTerms)
       return true;
   }
 
@@ -637,10 +639,17 @@
   // count that as an additional common instruction for the following
   // heuristics.
   unsigned EffectiveTailLen = CommonTailLen;
-  if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
-      !MBB1->back().isBarrier() &&
-      !MBB2->back().isBarrier())
-    ++EffectiveTailLen;
+  // We can't do this during layout because it undoes tail duplication, despite
+  // non-overlapping thresholds. What happens is that a block with a
+  // conditional branch and fallthrough gets tail-duplicated and this heuristic
+  // gets run with the non-fallthrough successor as SuccBB, and counts an extra
+  // branch, despite the block having only 2 instructions when it was
+  // duplicated.
+  if (!AfterPlacement)
+    if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
+        !MBB1->back().isBarrier() &&
+        !MBB2->back().isBarrier())
+      ++EffectiveTailLen;
 
   // Check if the common tail is long enough to be worthwhile.
   if (EffectiveTailLen >= minCommonTailLength)
@@ -682,7 +691,8 @@
                             minCommonTailLength,
                             CommonTailLen, TrialBBI1, TrialBBI2,
                             SuccBB, PredBB,
-                            FuncletMembership)) {
+                            FuncletMembership,
+                            AfterBlockPlacement)) {
         if (CommonTailLen > maxCommonTailLength) {
           SameTails.clear();
           maxCommonTailLength = CommonTailLen;
Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -117,6 +118,12 @@
 static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
                                       cl::desc("Cost of jump instructions."),
                                       cl::init(1), cl::Hidden);
+static cl::opt<bool>
+TailDupPlacement("tail-dup-placement",
+              cl::desc("Perform tail duplication during placement. "
+                       "Creates more fallthrough opportunites in "
+                       "outline branches."),
+              cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
 BranchFoldPlacement("branch-fold-placement",
@@ -258,6 +265,13 @@
   /// \brief A handle to the post dominator tree.
   MachineDominatorTree *MDT;
 
+  /// \brief Duplicator used to duplicate tails during placement.
+  ///
+  /// Placement decisions can open up new tail duplication opportunities, but
+  /// since tail duplication affects placement decisions of later blocks, it
+  /// must be done inline.
+  TailDuplicator TailDup;
+
   /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
   /// all terminators of the MachineFunction.
   SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
@@ -864,6 +878,20 @@
     // after this block.
     MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
 
+    // Placing an actual successor may have changed tail duplication
+    // opportunities. Check for that now.
+    if (TailDupPlacement && BestSucc) {
+      DEBUG(dbgs() << "Redoing tail duplication for BestSucc#"
+                   << BestSucc->getNumber() << "\n");
+      bool IsSimple = TailDup.isSimpleBB(BestSucc);
+      bool IgnoreFallthrough = true;
+      // Simple blocks should just fallthrough, so only worry about non-simple
+      // ones.
+      if (!IsSimple && TailDup.shouldTailDuplicate(*F, IsSimple,
+                                                   *BestSucc, IgnoreFallthrough))
+        TailDup.tailDuplicateAndUpdate(*F, IsSimple, BestSucc, BB);
+    }
+
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
     // this point. This won't be a fallthrough, but it will increase locality.
@@ -1659,6 +1687,10 @@
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MDT = &getAnalysis<MachineDominatorTree>();
+  auto MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+  if (TailDupPlacement)
+    TailDup.initMF(MF, MMI, MBPI);
+
   assert(BlockToChain.empty());
 
   buildCFGChains();
@@ -1680,6 +1712,8 @@
                             /*AfterBlockPlacement=*/true)) {
       // Redo the layout if tail merging creates/removes/moves blocks.
       BlockToChain.clear();
+      // Must redo the dominator tree if blocks were changed.
+      MDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }
Index: lib/CodeGen/TailDuplicator.cpp
===================================================================
--- lib/CodeGen/TailDuplicator.cpp
+++ lib/CodeGen/TailDuplicator.cpp
@@ -123,15 +123,16 @@
 }
 
 /// Tail duplicate the block and cleanup.
-bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple,
-                                            MachineBasicBlock *MBB) {
+bool TailDuplicator::tailDuplicateAndUpdate(
+    MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB,
+    MachineBasicBlock *ForcedLayoutPred) {
   // Save the successors list.
   SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
 
   SmallVector<MachineBasicBlock *, 8> TDBBs;
   SmallVector<MachineInstr *, 16> Copies;
-  if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies))
+  if (!tailDuplicate(MF, IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies))
     return false;
 
   ++NumTails;
@@ -242,10 +243,11 @@
 
     bool IsSimple = isSimpleBB(MBB);
 
-    if (!shouldTailDuplicate(MF, IsSimple, *MBB))
+    bool IgnoreFallthrough = false;
+    if (!shouldTailDuplicate(MF, IsSimple, *MBB, IgnoreFallthrough))
       continue;
 
-    MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB);
+    MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB, nullptr);
   }
 
   if (PreRegAlloc && TailDupVerify)
@@ -509,9 +511,20 @@
 /// Determine if it is profitable to duplicate this block.
 bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF,
                                          bool IsSimple,
-                                         MachineBasicBlock &TailBB) {
-  // Only duplicate blocks that end with unconditional branches.
-  if (TailBB.canFallThrough())
+                                         MachineBasicBlock &TailBB,
+                                         bool IgnoreFallthrough) {
+  // IgnoreFallthrough is set when considering duplication during layout.
+  // Because the ultimate layout may change, it is better to consider
+  // duplicating blocks that can't fall through.
+  if (TailBB.canFallThrough() && !IgnoreFallthrough)
+    return false;
+
+  // If the block to be duplicated ends in an unanalyzable fallthrough, don't
+  // duplicate it.
+  MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+  SmallVector<MachineOperand, 4> PredCond;
+  if (TailBB.canFallThrough() &&
+      TII->AnalyzeBranch(TailBB, PredTBB, PredFBB, PredCond, true))
     return false;
 
   // Don't try to tail-duplicate single-block loops.
@@ -725,6 +738,7 @@
 /// of its predecessors.
 bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple,
                                    MachineBasicBlock *TailBB,
+                                   MachineBasicBlock *ForcedLayoutPred,
                                    SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                                    SmallVectorImpl<MachineInstr *> &Copies) {
   DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
@@ -759,7 +773,12 @@
     if (!PredCond.empty())
       continue;
     // Don't duplicate into a fall-through predecessor (at least for now).
-    if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+    bool IsLayoutSuccessor = false;
+    if (ForcedLayoutPred)
+      IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
+    else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+      IsLayoutSuccessor = true;
+    if (IsLayoutSuccessor)
       continue;
 
     DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
@@ -825,13 +844,14 @@
 
   // If TailBB was duplicated into all its predecessors except for the prior
   // block, which falls through unconditionally, move the contents of this
-  // block into the prior block.
+  // block into the prior block. Don't do this when ForcedLayoutPred is
+  // non-null, as it can break layout to remove blocks.
   MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
   MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
   SmallVector<MachineOperand, 4> PriorCond;
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
-  if (PrevBB->succ_size() == 1 &&
+  if (ForcedLayoutPred == nullptr && PrevBB->succ_size() == 1 &&
       !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
       PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
       !TailBB->hasAddressTaken()) {
Index: test/CodeGen/AArch64/machine_cse.ll
===================================================================
--- test/CodeGen/AArch64/machine_cse.ll
+++ test/CodeGen/AArch64/machine_cse.ll
@@ -1,4 +1,8 @@
-; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnuabi -O2 -tail-dup-placement=0 | FileCheck %s
+; -tail-dup-placement causes tail duplication during layout. This breaks the
+; assumptions of the test case as written (specifically, it creates an
+; additional cmp instruction, creating a false positive), so we pass
+; -tail-dup-placement=0 to restore the original behavior
 
 ; marked as external to prevent possible optimizations
 @a = external global i32
Index: test/CodeGen/PowerPC/branch-opt.ll
===================================================================
--- test/CodeGen/PowerPC/branch-opt.ll
+++ test/CodeGen/PowerPC/branch-opt.ll
@@ -1,9 +1,14 @@
-; RUN: llc < %s -march=ppc32 | \
-; RUN:   grep "b LBB.*" | count 4
+; RUN: llc < %s -march=ppc32 | FileCheck %s
 
 target datalayout = "E-p:32:32"
 target triple = "powerpc-apple-darwin8.7.0"
 
+;CHECK-LABEL: foo:
+;CHECK: b LBB0_14
+;CHECK: b LBB0_14
+;CHECK-NOT: b LBB0_14
+;CHECK: LBB0_14: ; %cond_next48
+
 define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) {
 entry:
 	%tmp1 = and i32 %W, 1		; <i32> [#uses=1]
Index: test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/tail-dup-analyzable-fallthrough.ll
@@ -0,0 +1,64 @@
+; RUN: llc -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le--linux-gnu"
+
+%node = type { %node*, i64 }
+; This test is checking for a bug where blocks with un-analyzable fallthrough
+; were getting tail-duplicated, leading to incorrect code. The loop-entry check
+; gets transformed to have a conditional exit, which is currently un-analyzable.
+; This block should not be tail-duplicated. This is from an actual hash table
+; that was miscompiling.
+; CHECK-LABEL: walkList
+
+; Don't duplicate the entry block
+; CHECK: b [[LOOPENTRY:[._0-9A-Za-z]+]]
+
+; The Loop top does a pointer chase.
+; CHECK: [[LOOPTOP:[._0-9A-Za-z]+]]:
+; CHECK-NOT: {{{[._0-9A-Za-z]+}}}:
+; CHECK: ld [[PTRREG:[0-9]+]], 0([[PTRREG]])
+
+; The loop entry compares the pointer to 0 before the chase and early exits if
+; it is null.
+; CHECK-NEXT: [[LOOPENTRY]]:
+; CHECK-NOT: {{{[._0-9A-Za-z]+}}}:
+; CHECK: cmpldi [[PTRREG]], 0
+; CHECK-NEXT: beqlr
+
+; Walk a linked list, returning the %node that contains %Key, or null if no
+; such node exists.
+; Function Attrs: norecurse nounwind readonly
+define %node* @walkList(%node* nocapture readonly %current.in, i64 %Key) #0 {
+entry:
+  %tobool13 = icmp eq %node* %current.in, null
+  br i1 %tobool13, label %cond.end, label %land.rhs.preheader
+
+land.rhs.preheader:                               ; preds = %entry
+  br label %land.rhs
+
+land.rhs:                                         ; preds = %land.rhs.preheader, %for.inc
+  %current.014 = phi %node* [ %current.0, %for.inc ], [ %current.in, %land.rhs.preheader ]
+  %Key1 = getelementptr inbounds %node, %node* %current.014, i64 0, i32 1
+  %0 = load i64, i64* %Key1, align 8
+  %lnot = icmp eq i64 %0, %Key
+  br i1 %lnot, label %cond.true, label %for.inc
+
+for.inc:                                          ; preds = %land.rhs
+  %Next = getelementptr inbounds %node, %node* %current.014, i64 0, i32 0
+  %current.0 = load %node*, %node** %Next, align 8
+  %tobool = icmp eq %node* %current.0, null
+  br i1 %tobool, label %cond.end.loopexit, label %land.rhs
+
+cond.true:                                        ; preds = %land.rhs
+  %current.014.lcssa = phi %node* [ %current.014, %land.rhs ]
+  br label %cond.end
+
+cond.end.loopexit:                                ; preds = %for.inc
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.end.loopexit, %entry, %cond.true
+  %cond = phi %node* [ %current.014.lcssa, %cond.true ], [ null, %entry ], [ null, %cond.end.loopexit ]
+  ret %node* %cond
+}
+
+attributes #0 = { norecurse nounwind readonly }
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -0,0 +1,100 @@
+; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Intended layout:
+; The outlining flag produces the layout
+; test1
+; test2
+; test3
+; test4
+; exit
+; optional1
+; optional2
+; optional3
+; optional4
+; Tail duplication puts test n+1 at the end of optional n
+; so optional1 includes a copy of test2 at the end, and branches
+; to test3 (at the top) or falls through to optional 2.
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: f:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[TEST3LABEL]]
+;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, [[TEST4LABEL]]
+;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, [[EXITLABEL]]
+;CHECK-NEXT: [[OPT4LABEL]]
+;CHECK: b [[EXITLABEL]]
+
+define void @f(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %exit, label %optional4
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %exit
+exit:
+  ret void
+}
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
Index: test/CodeGen/WebAssembly/cfg-stackify.ll
===================================================================
--- test/CodeGen/WebAssembly/cfg-stackify.ll
+++ test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false | FileCheck -check-prefix=OPT %s
 
 ; Test the CFG stackifier pass.
 
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
Index: test/CodeGen/X86/fp-une-cmp.ll
===================================================================
--- test/CodeGen/X86/fp-une-cmp.ll
+++ test/CodeGen/X86/fp-une-cmp.ll
@@ -56,11 +56,11 @@
 ; CHECK-NEXT:    ucomisd %xmm1, %xmm0
 ; CHECK-NEXT:    jne .LBB1_1
 ; CHECK-NEXT:    jp .LBB1_1
-; CHECK-NEXT:  .LBB1_2: # %bb2
+; CHECK-NEXT:  # %bb2
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1: # %bb1
 ; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    retq
 
 entry:
   %mul = fmul double %x, %y