diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -353,6 +353,9 @@
   bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
                                               LLT Ty2) const override;
+
+  bool shouldSinkOperands(Instruction *I,
+                          SmallVectorImpl<Use *> &Ops) const override;
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5133,3 +5133,22 @@
   return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
          Ty2 == LLT::scalar(32);
 }
+
+/// Whether it is profitable to sink the operands of an
+/// Instruction I to the basic block of I.
+/// This helps to use modifiers (like abs and neg) more often.
+bool AMDGPUTargetLowering::shouldSinkOperands(
+    Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+  using namespace PatternMatch;
+
+  for (auto &Op : I->operands()) {
+    // Ensure we are not already sinking this operand.
+    if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
+      continue;
+
+    if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
+      Ops.push_back(&Op);
+  }
+
+  return !Ops.empty();
+}
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
--- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -53,7 +53,7 @@
 ; Make sure there's no verifier error with an undef source.
 ; SI-LABEL: {{^}}bitset_verifier_error:
 ; SI-NOT: %bb.1:
-; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
 define void @bitset_verifier_error() local_unnamed_addr #0 {
 bb:
   %i = call float @llvm.fabs.f32(float undef) #0
diff --git a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
--- a/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll
@@ -13,35 +13,28 @@
 ; ISA-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; ISA-NEXT: v_mov_b32_e32 v7, 0
 ; ISA-NEXT: s_waitcnt lgkmcnt(0)
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: s_cselect_b32 s6, -1, 0
-; ISA-NEXT: s_and_b32 s6, s6, exec_lo
-; ISA-NEXT: s_cselect_b32 s6, s5, 0
+; ISA-NEXT: s_lshr_b32 s6, s5, 1
 ; ISA-NEXT: s_lshr_b32 s7, 1, s4
 ; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v0, s6
-; ISA-NEXT: s_cselect_b32 s8, -1, 0
-; ISA-NEXT: s_and_b32 s8, s8, exec_lo
-; ISA-NEXT: s_cselect_b32 s7, s7, 0
-; ISA-NEXT: s_lshr_b32 s5, s5, 1
-; ISA-NEXT: s_cmp_lg_u32 s4, 0
-; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s7
 ; ISA-NEXT: s_cselect_b32 s4, -1, 0
-; ISA-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4
+; ISA-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s4
 ; ISA-NEXT: s_and_b32 s4, s4, exec_lo
-; ISA-NEXT: s_cselect_b32 s4, s5, 0
-; ISA-NEXT: v_cvt_f32_i32_e32 v5, s4
+; ISA-NEXT: s_cselect_b32 s4, s6, 0
+; ISA-NEXT: s_cselect_b32 s6, s7, 0
+; ISA-NEXT: s_cselect_b32 s5, s5, 0
+; ISA-NEXT: v_cvt_f32_i32_e32 v3, s4
+; ISA-NEXT: v_cvt_f32_ubyte0_e32 v4, s6
+; ISA-NEXT: v_cvt_f32_i32_e32 v5, s5
 ; ISA-NEXT: s_mov_b32 s4, 0
-; ISA-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5
 ; ISA-NEXT: .LBB0_1: ; %bb14
 ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
 ; ISA-NEXT: v_mov_b32_e32 v6, v7
 ; ISA-NEXT: s_and_b32 s5, exec_lo, vcc_lo
 ; ISA-NEXT: s_or_b32 s4, s5, s4
-; ISA-NEXT: v_add_f32_e32 v7, v6, v3
-; ISA-NEXT: v_add_f32_e32 v7, v7, v5
+; ISA-NEXT: v_add_f32_e32 v7, v6, v0
+; ISA-NEXT: v_add_f32_e64 v7, v7, |v3|
 ; ISA-NEXT: v_add_f32_e32 v7, v7, v4
-; ISA-NEXT: v_add_f32_e32 v7, v7, v0
+; ISA-NEXT: v_add_f32_e32 v7, v7, v5
 ; ISA-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
 ; ISA-NEXT: s_cbranch_execnz .LBB0_1
 ; ISA-NEXT: ; %bb.2: ; %bb21
@@ -58,56 +51,50 @@
   ; MIR-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; MIR-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; MIR-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+  ; MIR-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
   ; MIR-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; MIR-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[S_MOV_B64_]], 0, 0 :: (invariant load (s64) from `ptr addrspace(4) null`, align 4294967296, addrspace 4)
-  ; MIR-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
-  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
-  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: $scc = COPY [[COPY5]]
-  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY3]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1
-  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_1]], [[COPY4]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
+  ; MIR-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
+  ; MIR-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+  ; MIR-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+  ; MIR-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY4]], [[S_MOV_B32_]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_MOV_B32_]], [[COPY5]], implicit-def dead $scc
+  ; MIR-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; MIR-NEXT: S_CMP_LG_U32 [[COPY5]], [[S_MOV_B32_1]], implicit-def $scc
   ; MIR-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
   ; MIR-NEXT: $scc = COPY [[COPY6]]
-  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[S_LSHR_B32_1:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc
-  ; MIR-NEXT: S_CMP_LG_U32 [[COPY4]], [[S_MOV_B32_]], implicit-def $scc
-  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec = COPY $scc
-  ; MIR-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
-  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
-  ; MIR-NEXT: $scc = COPY [[COPY7]]
-  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_]], implicit $scc
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483647
-  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[V_CVT_F32_I32_e64_]]
-  ; MIR-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 killed [[COPY9]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
-  ; MIR-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; MIR-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_3]]
-  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_4]], 0, [[COPY10]], [[COPY7]], implicit $exec
-  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: [[S_CSELECT_B32_:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_]]
+  ; MIR-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216
+  ; MIR-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; MIR-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_2]]
+  ; MIR-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_3]], 0, [[COPY8]], [[COPY6]], implicit $exec
+  ; MIR-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY [[V_CNDMASK_B32_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_1:%[0-9]+]]:sreg_32 = S_CSELECT_B32 killed [[S_LSHR_B32_1]], [[S_MOV_B32_1]], implicit $scc
   ; MIR-NEXT: [[V_CVT_F32_UBYTE0_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_UBYTE0_e64 killed [[S_CSELECT_B32_1]], 0, 0, implicit $exec
-  ; MIR-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
-  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
-  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_1]], implicit $exec
-  ; MIR-NEXT: [[COPY14:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
+  ; MIR-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_UBYTE0_e64_]]
+  ; MIR-NEXT: $scc = COPY [[COPY6]]
+  ; MIR-NEXT: [[S_CSELECT_B32_2:%[0-9]+]]:sreg_32 = S_CSELECT_B32 [[COPY4]], [[S_MOV_B32_1]], implicit $scc
+  ; MIR-NEXT: [[V_CVT_F32_I32_e64_1:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e64 killed [[S_CSELECT_B32_2]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY [[V_CVT_F32_I32_e64_1]]
+  ; MIR-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_LT_I32_e64 [[COPY2]], [[S_MOV_B32_]], implicit $exec
+  ; MIR-NEXT: [[COPY12:%[0-9]+]]:vreg_1 = COPY [[V_CMP_LT_I32_e64_]]
   ; MIR-NEXT: {{ $}}
   ; MIR-NEXT: bb.1.bb14:
   ; MIR-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; MIR-NEXT: {{ $}}
-  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %7, %bb.1
-  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_4]], %bb.0, %8, %bb.1
-  ; MIR-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY14]]
-  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY15]], [[PHI]], implicit-def dead $scc
-  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 0, [[S_AND_B32_]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY12]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY13]], 0, 0, implicit $mode, implicit $exec
-  ; MIR-NEXT: [[COPY16:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
+  ; MIR-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_1]], %bb.0, %7, %bb.1
+  ; MIR-NEXT: [[PHI1:%[0-9]+]]:sgpr_32 = PHI [[S_MOV_B32_3]], %bb.0, %8, %bb.1
+  ; MIR-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY12]]
+  ; MIR-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK [[COPY13]], [[PHI]], implicit-def dead $scc
+  ; MIR-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY9]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_]], 2, [[COPY7]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_1]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[V_ADD_F32_e64_2]], 0, [[COPY11]], 0, 0, implicit $mode, implicit $exec
+  ; MIR-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY [[V_ADD_F32_e64_3]]
   ; MIR-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; MIR-NEXT: S_BRANCH %bb.2
   ; MIR-NEXT: {{ $}}
@@ -115,7 +102,7 @@
   ; MIR-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1
   ; MIR-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1
   ; MIR-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY8]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
+  ; MIR-NEXT: FLAT_STORE_DWORD [[COPY3]], [[PHI2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %ir.ptr)
   ; MIR-NEXT: SI_RETURN
 bb:
   %i = load <2 x i32>, ptr addrspace(4) null, align 4294967296
diff --git a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
--- a/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-fabs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 -start-before=amdgpu-late-codegenprepare < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -disable-machine-sink=1 - < %s | FileCheck -check-prefix=GFX10 %s
 
 define float @fold_abs_in_branch(float %arg1, float %arg2) {
 ; GFX10-LABEL: fold_abs_in_branch:
@@ -10,10 +10,9 @@
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
 ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -41,16 +40,15 @@
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
-; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e64 v1, |v0|, |v0|
+; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v1
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e32 v0, 0x3e4ccccd, v1
+; GFX10-NEXT: v_mul_f32_e64 v1, 0x3e4ccccd, |v0|
 ; GFX10-NEXT: ; %bb.2: ; %exit
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_add_f32_e32 v1, 2.0, v1
-; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_add_f32_e64 v0, |v0|, 2.0
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
   %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
@@ -77,11 +75,10 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_f32_e64 v0, |s4|, |s4|
-; GFX10-NEXT: s_bitset0_b32 s4, 31
 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
 ; GFX10-NEXT: s_cbranch_vccnz .LBB2_2
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, s4
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
 ; GFX10-NEXT: .LBB2_2: ; %exit
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -107,11 +104,10 @@
 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: v_add_f32_e64 v0, |s4|, |s4|
-; GFX10-NEXT: s_bitset0_b32 s4, 31
 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, 1.0, v0
 ; GFX10-NEXT: s_cbranch_vccnz .LBB3_2
 ; GFX10-NEXT: ; %bb.1: ; %if
-; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, s4
+; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |s4|
 ; GFX10-NEXT: .LBB3_2: ; %exit
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -140,7 +136,6 @@
 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
 ; GFX10-NEXT: v_add_f32_e32 v1, v0, v1
 ; GFX10-NEXT: v_add_f32_e64 v0, |v1|, |v1|
-; GFX10-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
 ; GFX10-NEXT: ; %bb.1: ; %if
 ; GFX10-NEXT: v_mul_f32_e64 v0, 0x3e4ccccd, |v1|
@@ -175,14 +170,16 @@
 ; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT: v_add_f32_e64 v0, |v0|, |v0|
 ; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
-; GFX10-NEXT: s_cbranch_execz .LBB5_2
-; GFX10-NEXT: .LBB5_1: ; %l
+; GFX10-NEXT: s_cbranch_execz .LBB5_3
+; GFX10-NEXT: ; %bb.1: ; %header.preheader
+; GFX10-NEXT: ; implicit-def: $vgpr0
+; GFX10-NEXT: .LBB5_2: ; %header
 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x40400000, v0
 ; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, -1.0, v0
 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
-; GFX10-NEXT: s_cbranch_vccnz .LBB5_1
-; GFX10-NEXT: .LBB5_2: ; %exit
+; GFX10-NEXT: s_cbranch_vccnz .LBB5_2
+; GFX10-NEXT: .LBB5_3: ; %Flow1
 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -209,5 +206,40 @@
   ret float %ret
 }
 
+define float @fold_neg_in_branch(float %arg1, float %arg2) {
+; GFX10-LABEL: fold_neg_in_branch:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: s_mov_b32 s4, exec_lo
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_cmpx_nlt_f32_e32 1.0, v0
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: v_rcp_f32_e64 v1, -v0
+; GFX10-NEXT: v_mul_f32_e64 v1, |v0|, v1
+; GFX10-NEXT: ; %bb.2: ; %exit
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+entry:
+  %0 = fadd reassoc nnan nsz arcp contract afn float %arg1, %arg2
+  %1 = fadd reassoc nnan nsz arcp contract afn float %0, %arg2
+  %2 = fneg reassoc nnan nsz arcp contract afn float %1
+  %3 = fcmp ule float %1, 1.000000e+00
+  br i1 %3, label %if, label %exit
+
+if:
+  %if.fabs = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %1)
+  %if.3 = fdiv reassoc nnan nsz arcp contract afn float %if.fabs, %2
+  br label %exit
+
+exit:
+  %ret = phi float [ %1, %entry ], [ %if.3, %if ]
+  %ret.2 = fmul reassoc nnan nsz arcp contract afn float %2, %ret
+  ret float %ret.2
+}
+
 declare float @llvm.fabs.f32(float)
 declare float @llvm.fmuladd.f32(float, float, float) #0
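
Illustrative sketch (not part of the patch): the shape of IR this hook targets. The fabs below is defined in %entry but its only floating-point use is in %if. With shouldSinkOperands reporting that use, the generic operand sinking in CodeGenPrepare, which consults this hook, can move (or duplicate) the fabs into %if, so instruction selection in that block folds it into a source modifier (as in the v_mul_f32_e64 ..., |v1| checks above) instead of emitting a separate v_and_b32. Function and value names here are made up for illustration.

; sink-fabs-sketch.ll (hypothetical example, not from the test suite)
define float @sink_fabs_sketch(float %x, float %y) {
entry:
  %a = fadd float %x, %y
  %fabs = call float @llvm.fabs.f32(float %a)
  %cc = fcmp ule float %a, 1.0
  br i1 %cc, label %if, label %exit

if:
  ; After sinking, a copy of the fabs sits next to this fmul, so the DAG for
  ; this block sees (fmul (fabs %a), 2.0) and can select the |...| modifier.
  %m = fmul float %fabs, 2.0
  br label %exit

exit:
  %r = phi float [ %a, %entry ], [ %m, %if ]
  ret float %r
}

declare float @llvm.fabs.f32(float)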