diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -18,6 +18,9 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
@@ -47,6 +50,7 @@
 
   AssumptionCache *AC = nullptr;
   UniformityInfo *UA = nullptr;
+  DominatorTree *DT = nullptr;
 
 public:
   static char ID;
@@ -76,6 +80,9 @@
 
   bool canWidenScalarExtLoad(LoadInst &LI) const;
   bool visitLoadInst(LoadInst &LI);
+  bool visitIntrinsicInst(IntrinsicInst &I);
+  bool cloneInstructionToUsers(Instruction *I);
+  bool hasUndefOrPoisonOperand(IntrinsicInst &I, unsigned Operand = 0) const;
 };
 
 } // end anonymous namespace
@@ -93,6 +100,9 @@
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
   bool Changed = false;
   for (auto &BB : F)
     for (Instruction &I : llvm::make_early_inc_range(BB))
@@ -177,6 +187,115 @@
   return true;
 }
 
+bool AMDGPULateCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  switch (I.getIntrinsicID()) {
+  case Intrinsic::fabs:
+    return cloneInstructionToUsers(&I);
+  default:
+    return false;
+  }
+}
+
+/**
+ * This function enables the SelectionDAG to use source modifiers more often.
+ * In some cases, e.g. after inlining a function which introduces a call to
+ * the llvm.fabs intrinsic, the IR ends up with users which reside outside of
+ * the defining basic block. The generated code will then be suboptimal
+ * because instruction selection works one basic block at a time and cannot
+ * apply the abs modifier to those users.
+ * This function records all users of a given instruction which do not reside
+ * in the same BB and are dominated by the intrinsic call. A single clone of
+ * the intrinsic call is inserted at the earliest insertion point of each such
+ * BB, and all recorded users in that BB are adjusted to use the clone
+ * instead.
+ * For now, this is only implemented for the fabs intrinsic.
+ * It appears that this already works for fneg instructions.
+ * NOTE: This could be adjusted to move intrinsic calls to their use if there
+ * is only a single user outside of the defining BB.
+ *
+ * @param I The instruction to clone
+ * @return Whether anything was changed.
+ */
+bool AMDGPULateCodeGenPrepare::cloneInstructionToUsers(Instruction *I) {
+  using namespace PatternMatch;
+
+  bool Changed = false;
+  IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I);
+
+  // Early opt-out as we don't want to operate on undef or poison values.
+  if (Intrinsic && hasUndefOrPoisonOperand(*Intrinsic))
+    return false;
+
+  DenseMap<BasicBlock *, SmallVector<Instruction *>> UsersPerBB;
+  bool CanErase = true;
+
+  // Filter out the users which we don't want to operate on, keep the rest.
+  for (User *U : I->users()) {
+    Instruction *UI = dyn_cast<Instruction>(U);
+    if (!UI)
+      continue;
+
+    // Don't do anything if there is a user in the same BB.
+    if (UI->getParent() == I->getParent()) {
+      CanErase = false;
+      continue;
+    }
+
+    // Ignore complex control flow for now.
+    const bool IsPhi = isa<PHINode>(UI);
+
+    // Removing the original instruction would leave badref operands behind in
+    // phi nodes, so ignore these cases as well.
+    if (IsPhi || !DT || !DT->dominates(I, UI)) {
+      CanErase = false;
+      continue;
+    }
+
+    if (Intrinsic) {
+      // Don't generate fabs(fabs(x)) calls.
+      if (auto *UII = dyn_cast<IntrinsicInst>(UI);
+          UII && (UII->getIntrinsicID() == Intrinsic->getIntrinsicID()))
+        continue;
+    }
+
+    // Record the user so we can process it later.
+    UsersPerBB[UI->getParent()].push_back(UI);
+  }
+
+  DenseMap<BasicBlock *, Instruction *> ClonesPerBB;
+
+  // Generate one clone for each BB.
+  for (auto &[BB, Users] : UsersPerBB) {
+    Instruction *Clone = nullptr;
+    if (ClonesPerBB.contains(BB)) {
+      Clone = ClonesPerBB[BB];
+    } else {
+      Clone = I->clone();
+      Clone->setName(I->getName() + Twine(".clone"));
+      Clone->insertBefore(&*BB->getFirstInsertionPt());
+      ClonesPerBB[BB] = Clone;
+    }
+
+    // Adjust the users so they use the BB-local clone.
+    for (Instruction *UI : Users) {
+      UI->replaceUsesOfWith(I, Clone);
+      Changed = true;
+    }
+  }
+
+  if (Changed && CanErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+bool AMDGPULateCodeGenPrepare::hasUndefOrPoisonOperand(IntrinsicInst &I,
+                                                       unsigned Operand) const {
+  using PatternMatch::m_Poison;
+  using PatternMatch::m_Undef;
+
+  return match(I.getOperand(Operand), m_Undef()) ||
+         match(I.getOperand(Operand), m_Poison());
+}
+
 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR late optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -13,35 +13,28 @@
 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; ISA-NEXT: v_mov_b32_e32 v7, 0
 ; ISA-NEXT: s_waitcnt lgkmcnt(0)
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: s_cselect_b32 s6, -1, 0
-; ISA-NEXT: s_and_b32 s6, s6, exec_lo
-; ISA-NEXT: s_cselect_b32 s6, s5, 0
+; ISA-NEXT: s_lshr_b32 s6, s5, 1
 ; ISA-NEXT: s_lshr_b32 s7, 1, s4
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
-; ISA-NEXT: s_cselect_b32 s8, -1, 0
-; ISA-NEXT: s_and_b32 s8, s8, exec_lo
-; ISA-NEXT: s_cselect_b32 s7, s7, 0
-; ISA-NEXT: s_lshr_b32 s5, s5, 1
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
 ; ISA-NEXT: s_cselect_b32 s4, -1, 0
-; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
+; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
 ; ISA-NEXT: s_and_b32 s4, s4, exec_lo
-; ISA-NEXT: s_cselect_b32 s4, s5, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4
+; ISA-NEXT: s_cselect_b32 s4, s6, 0
+; ISA-NEXT: s_cselect_b32 s6, s7, 0
+; ISA-NEXT: s_cselect_b32 s5, s5, 0
+; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
+; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
+; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
 ; ISA-NEXT: s_mov_b32 s4, 0
-; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
 ; ISA-NEXT: .LBB0_1: ; %bb14
 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
 ; ISA-NEXT: v_mov_b32_e32 v6, v7
 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
 ; ISA-NEXT: s_or_b32 s4, s5, s4
-; ISA-NEXT: v_add_f32_e32 v7, v6, v3
-; ISA-NEXT: v_add_f32_e32 v7, v7, v5
+; ISA-NEXT: v_add_f32_e32 v7, v6, v0
+; ISA-NEXT: v_add_f32_e64 v7, v7, |v3|
 ; ISA-NEXT: v_add_f32_e32 v7, v7, v4
-; ISA-NEXT: v_add_f32_e32 v7, v7, v0
+; ISA-NEXT: v_add_f32_e32 v7, v7, v5
 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; ISA-NEXT: s_cbranch_execnz .LBB0_1
 ; ISA-NEXT: ; %bb.2: ; %bb21
@@ -58,56 +51,50 @@
   ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
   ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
-  ; MIR-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: $scc = COPY [[COPY5]]
-  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
   ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
   ; MIR-NEXT: $scc = COPY [[COPY6]]
-  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; MIR-NEXT: $scc = COPY [[COPY7]]
-  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
-  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
-  ; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
-  ; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
-  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
-  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
+  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
+  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
+  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
   ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
-  ; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
-  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec
-  ; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
+  ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
+  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
+  ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
   ; MIR-NEXT: {{ $}}
   ; MIR-NEXT: bb.1.bb14:
   ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; MIR-NEXT: {{ $}}
-  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1
-  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1
-  ; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]
-  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc
-  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
+  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
+  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
+  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
+  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
+  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
   ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; MIR-NEXT: S_BRANCH %bb.2
   ; MIR-NEXT: {{ $}}
@@ -115,7 +102,7 @@
   ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
   ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
   ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
+  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
   ; MIR-NEXT: SI_RETURN
 bb:
   %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
--- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
@@ -10,10 +10,9 @@
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
 ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -41,16 +40,15 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0|
+; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_f32_e32 v1, 2.0, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2