diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -18,6 +18,9 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
@@ -47,6 +50,7 @@
 
   AssumptionCache *AC = nullptr;
   UniformityInfo *UA = nullptr;
+  DominatorTree *DT = nullptr;
 
 public:
   static char ID;
@@ -76,6 +80,9 @@
 
   bool canWidenScalarExtLoad(LoadInst &LI) const;
   bool visitLoadInst(LoadInst &LI);
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool cloneInstructionToUsers(Instruction *I);
+  bool hasUndefOrPoisonOperand(IntrinsicInst &I, unsigned Operand = 0) const;
 };
 
 } // end anonymous namespace
@@ -93,6 +100,9 @@
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   bool Changed = false;
   for (auto &BB : F)
     for (Instruction &I : llvm::make_early_inc_range(BB))
@@ -177,6 +187,115 @@
   return true;
 }
 
+bool AMDGPULateCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::fabs:
+    return cloneInstructionToUsers(&I);
+  default:
+    return false;
+  }
+}
+
+/**
+ * This function enables the SelectionDAG to use source modifiers more often.
+ * In some cases, e.g. after inlining a function which introduces a call to
+ * the llvm.fabs intrinsic, the IR ends up with users which reside outside of
+ * the defining basic block. The generated code will then be suboptimal
+ * because instruction selection works one basic block at a time and cannot
+ * apply the abs modifier to those users.
+ * This function records all users of a given instruction which do not reside
+ * in the same BB and are dominated by the intrinsic call. A single clone of
+ * the intrinsic call is inserted at the earliest insertion point of each such
+ * BB, and all recorded users in that BB are adjusted to use the clone
+ * instead.
+ * For now, this is only implemented for the fabs intrinsic.
+ * It appears that this already works for fneg instructions.
+ * NOTE: This could be adjusted to move intrinsic calls to their use if there
+ * is only a single user outside of the defining BB.
+ *
+ * @param I The instruction to clone
+ * @return Whether anything was changed.
+ */
+bool AMDGPULateCodeGenPrepare::cloneInstructionToUsers(Instruction *I) {
+  using namespace PatternMatch;
+
+  bool Changed = false;
+  IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I);
+
+  // Early opt-out as we don't want to operate on undef or poison values.
+  if (Intrinsic && hasUndefOrPoisonOperand(*Intrinsic))
+    return false;
+
+  DenseMap<BasicBlock *, SmallVector<Instruction *>> UsersPerBB;
+  bool CanErase = true;
+
+  // Filter out the users which we don't want to operate on, keep the rest.
+  for (User *U : I->users()) {
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (!UI)
+      continue;
+
+    // Don't do anything if there is a user in the same BB.
+    if (UI->getParent() == I->getParent()) {
+      CanErase = false;
+      continue;
+    }
+
+    // Ignore complex control flow for now.
+    const bool IsPhi = isa<PHINode>(UI);
+
+    // Removing the original instruction would leave badref operands behind in
+    // phi nodes, so ignore these cases as well.
+    if (IsPhi || !DT || !DT->dominates(I, UI)) {
+      CanErase = false;
+      continue;
+    }
+
+    if (Intrinsic) {
+      // Don't generate fabs(fabs(x)) calls.
+      if (auto *UII = dyn_cast<IntrinsicInst>(UI);
+          UII && (UII->getIntrinsicID() == Intrinsic->getIntrinsicID()))
+        continue;
+    }
+
+    // Record the user so we can process it later.
+    UsersPerBB[UI->getParent()].push_back(UI);
+  }
+
+  DenseMap<BasicBlock *, Instruction *> ClonesPerBB;
+
+  // Generate one clone for each BB.
+  for (auto &[BB, Users] : UsersPerBB) {
+    Instruction *Clone = nullptr;
+    if (ClonesPerBB.contains(BB)) {
+      Clone = ClonesPerBB[BB];
+    } else {
+      Clone = I->clone();
+      Clone->setName(I->getName() + Twine(".clone"));
+      Clone->insertBefore(&*BB->getFirstInsertionPt());
+      ClonesPerBB[BB] = Clone;
+    }
+
+    // Adjust the users so they use the BB-local clone.
+    for (Instruction *UI : Users) {
+      UI->replaceUsesOfWith(I, Clone);
+      Changed = true;
+    }
+  }
+
+  if (Changed && CanErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::hasUndefOrPoisonOperand(IntrinsicInst &I,
+                                                       unsigned Operand) const {
+  using PatternMatch::m_Poison;
+  using PatternMatch::m_Undef;
+
+  return match(I.getOperand(Operand), m_Undef()) ||
+         match(I.getOperand(Operand), m_Poison());
+}
+
 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR late optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -13,35 +13,28 @@
 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; ISA-NEXT: v_mov_b32_e32 v7, 0
 ; ISA-NEXT: s_waitcnt lgkmcnt(0)
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: s_cselect_b32 s6, -1, 0
-; ISA-NEXT: s_and_b32 s6, s6, exec_lo
-; ISA-NEXT: s_cselect_b32 s6, s5, 0
+; ISA-NEXT: s_lshr_b32 s6, s5, 1
 ; ISA-NEXT: s_lshr_b32 s7, 1, s4
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
-; ISA-NEXT: s_cselect_b32 s8, -1, 0
-; ISA-NEXT: s_and_b32 s8, s8, exec_lo
-; ISA-NEXT: s_cselect_b32 s7, s7, 0
-; ISA-NEXT: s_lshr_b32 s5, s5, 1
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
 ; ISA-NEXT: s_cselect_b32 s4, -1, 0
-; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
+; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
 ; ISA-NEXT: s_and_b32 s4, s4, exec_lo
-; ISA-NEXT: s_cselect_b32 s4, s5, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4
+; ISA-NEXT: s_cselect_b32 s4, s6, 0
+; ISA-NEXT: s_cselect_b32 s6, s7, 0
+; ISA-NEXT: s_cselect_b32 s5, s5, 0
+; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
+; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
+; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
 ; ISA-NEXT: s_mov_b32 s4, 0
-; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
 ; ISA-NEXT: .LBB0_1: ; %bb14
 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
 ; ISA-NEXT: v_mov_b32_e32 v6, v7
 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
 ; ISA-NEXT: s_or_b32 s4, s5, s4
-; ISA-NEXT: v_add_f32_e32 v7, v6, v3
-; ISA-NEXT: v_add_f32_e32 v7, v7, v5
+; ISA-NEXT: v_add_f32_e32 v7, v6, v0
+; ISA-NEXT: v_add_f32_e64 v7, v7, |v3|
 ; ISA-NEXT: v_add_f32_e32 v7, v7, v4
-; ISA-NEXT: v_add_f32_e32 v7, v7, v0
+; ISA-NEXT: v_add_f32_e32 v7, v7, v5
 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; ISA-NEXT: s_cbranch_execnz .LBB0_1
 ; ISA-NEXT: ; %bb.2: ; %bb21
@@ -58,56 +51,50 @@
   ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
   ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
-  ; MIR-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: $scc = COPY [[COPY5]]
-  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
   ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
   ; MIR-NEXT: $scc = COPY [[COPY6]]
-  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; MIR-NEXT: $scc = COPY [[COPY7]]
-  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
-  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
-  ; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
-  ; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
-  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
-  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
+  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
+  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
+  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
   ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
-  ; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
-  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec
-  ; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
+  ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
+  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
+  ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
   ; MIR-NEXT: {{ $}}
   ; MIR-NEXT: bb.1.bb14:
   ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; MIR-NEXT: {{ $}}
-  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1
-  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1
-  ; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]
-  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc
-  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
+  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
+  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
+  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
+  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
+  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
   ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; MIR-NEXT: S_BRANCH %bb.2
   ; MIR-NEXT: {{ $}}
@@ -115,7 +102,7 @@
   ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
   ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
   ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
+  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
   ; MIR-NEXT: SI_RETURN
 bb:
   %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
--- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
@@ -10,10 +10,9 @@
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
 ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -41,16 +40,15 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0|
+; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_f32_e32 v1, 2.0, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2