Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -346,6 +346,17 @@
   /// Transform zext(trunc(x)) to x.
   bool matchCombineZextTrunc(MachineInstr &MI, Register &Reg);
 
+  /// Transform fptrunc(fpext(x)) to x.
+  bool matchCombineFPTruncExt(MachineInstr &MI, Register &Reg);
+
+  /// Transform fptrunc([su]itofp(x)) to [su]itofp x if the inner cast is exact.
+  bool matchCombineFPTruncIntToFloat(
+      MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo);
+
+  /// Transform fpext([su]itofp(x)) to [su]itofp x if the inner cast is exact.
+  bool matchCombineFPExtIntToFloat(
+      MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo);
+
   /// Transform [asz]ext([asz]ext(x)) to [asz]ext x.
   bool matchCombineExtOfExt(MachineInstr &MI,
                             std::tuple<Register, unsigned> &MatchInfo);
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -354,7 +354,7 @@
 def constant_fp_op_matchinfo: GIDefMatchData<"Optional<APFloat>">;
 def constant_fp_op: GICombineRule <
   (defs root:$root, constant_fp_op_matchinfo:$info),
-  (match (wip_match_opcode G_FNEG, G_FABS, G_FPTRUNC, G_FSQRT, G_FLOG2):$root,
+  (match (wip_match_opcode G_FNEG, G_FABS, G_FPTRUNC, G_FPEXT, G_FSQRT, G_FLOG2):$root,
     [{ return Helper.matchCombineConstantFoldFpUnary(*${root}, ${info}); }]),
   (apply [{ Helper.applyCombineConstantFoldFpUnary(*${root}, ${info}); }])
 >;
@@ -472,6 +472,31 @@
   (apply [{ Helper.applyCombineExtOfExt(*${root}, ${matchinfo}); }])
 >;
 
+// Fold (fptrunc (fpext x)) -> x if source type is same as destination type.
+def fptrunc_fpext_fold: GICombineRule <
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_FPTRUNC):$root,
+    [{ return Helper.matchCombineFPTruncExt(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.replaceSingleDefInstWithReg(*${root}, ${matchinfo}); }])
+>;
+
+// Fold (fptrunc ([su]itofp x)) -> ([su]itofp x), if the inner cast is exact.
+def fptrunc_int_to_float: GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$info),
+  (match (wip_match_opcode G_FPTRUNC): $d,
+    [{ return Helper.matchCombineFPTruncIntToFloat(*${d}, ${info}); }]),
+  (apply [{ Helper.applyBuildFn(*${d}, ${info}); }])
+>;
+
+// Fold (fpext ([su]itofp x)) -> ([su]itofp x), if x is exactly representable
+// in the intermediate floating point type.
+def fpext_int_to_float: GICombineRule<
+  (defs root:$d, build_fn_matchinfo:$info),
+  (match (wip_match_opcode G_FPEXT): $d,
+    [{ return Helper.matchCombineFPExtIntToFloat(*${d}, ${info}); }]),
+  (apply [{ Helper.applyBuildFn(*${d}, ${info}); }])
+>;
+
 def not_cmp_fold_matchinfo : GIDefMatchData<"SmallVector<Register, 4>">;
 def not_cmp_fold : GICombineRule<
   (defs root:$d, not_cmp_fold_matchinfo:$info),
@@ -695,7 +720,8 @@
                                         binop_same_val, binop_left_to_zero,
                                         binop_right_to_zero, p2i_to_i2p,
                                         i2p_to_p2i, anyext_trunc_fold,
-                                        fneg_fneg_fold, right_identity_one]>;
+                                        fneg_fneg_fold, right_identity_one,
+                                        fptrunc_fpext_fold]>;
 
 def const_combines : GICombineGroup<[constant_fp_op, const_ptradd_to_i2p,
                                      overlapping_and]>;
@@ -728,7 +754,7 @@
     const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
     shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
     div_rem_to_divrem, funnel_shift_combines, form_bitfield_extract,
-    constant_fold]>;
+    constant_fold, fptrunc_int_to_float, fpext_int_to_float]>;
 
 // A combine group used to for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1660,6 +1660,7 @@
     return V;
   }
   case TargetOpcode::G_FPTRUNC:
+  case TargetOpcode::G_FPEXT:
     break;
   case TargetOpcode::G_FSQRT: {
     bool Unused;
@@ -1675,8 +1676,9 @@
   }
   }
   // Convert `APFloat` to appropriate IEEE type depending on `DstTy`. Otherwise,
-  // `buildFConstant` will assert on size mismatch. Only `G_FPTRUNC`, `G_FSQRT`,
-  // and `G_FLOG2` reach here.
+  // `buildFConstant` will assert on size mismatch.
+  assert(Opcode == TargetOpcode::G_FPTRUNC || Opcode == TargetOpcode::G_FPEXT ||
+         Opcode == TargetOpcode::G_FSQRT || Opcode == TargetOpcode::G_FLOG2);
   bool Unused;
   V.convert(getFltSemanticForLLT(DstTy), APFloat::rmNearestTiesToEven, &Unused);
   return V;
@@ -2488,6 +2490,65 @@
   return false;
 }
 
+bool CombinerHelper::matchCombineFPTruncExt(MachineInstr &MI, Register &Reg) {
+  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC && "Expected a G_FPTRUNC");
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  // Truncating an extended value back to its original type is lossless.
+  return mi_match(SrcReg, MRI,
+                  m_GFPExt(m_all_of(m_Reg(Reg), m_SpecificType(DstTy))));
+}
+
+/// Return true if every value of the integer type \p IntTy converts to the
+/// floating point type \p FPTy without rounding.
+static bool isExactIntToFP(LLT IntTy, LLT FPTy, bool IsSigned) {
+  const unsigned FPBits = FPTy.getScalarSizeInBits();
+  // Only the IEEE half/single/double/quad sizes are treated as known here.
+  if (FPBits != 16 && FPBits != 32 && FPBits != 64 && FPBits != 128)
+    return false;
+  const unsigned Precision =
+      APFloat::semanticsPrecision(getFltSemanticForLLT(LLT::scalar(FPBits)));
+  // The sign of a signed integer does not occupy a significand bit.
+  return IntTy.getScalarSizeInBits() - IsSigned <= Precision;
+}
+
+/// Match a G_FPTRUNC/G_FPEXT \p MI whose source is a G_SITOFP/G_UITOFP that
+/// is exact, and set \p MatchInfo to build the integer conversion directly
+/// into the destination of \p MI.
+static bool
+matchFPCastOfExactIntToFP(MachineInstr &MI, const MachineRegisterInfo &MRI,
+                          std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+  unsigned SrcOpc = SrcMI->getOpcode();
+  if (SrcOpc != TargetOpcode::G_SITOFP && SrcOpc != TargetOpcode::G_UITOFP)
+    return false;
+  Register SrcInputReg = SrcMI->getOperand(1).getReg();
+  // If the inner conversion rounds, converting directly to the final type can
+  // produce a different value (e.g. double rounding for G_FPTRUNC).
+  if (!isExactIntToFP(MRI.getType(SrcInputReg), MRI.getType(SrcReg),
+                      SrcOpc == TargetOpcode::G_SITOFP))
+    return false;
+  MatchInfo = [=](MachineIRBuilder &B) {
+    B.buildInstr(SrcOpc, {DstReg}, {SrcInputReg});
+  };
+  return true;
+}
+
+bool CombinerHelper::matchCombineFPTruncIntToFloat(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC && "Expected a G_FPTRUNC");
+  return matchFPCastOfExactIntToFP(MI, MRI, MatchInfo);
+}
+
+bool CombinerHelper::matchCombineFPExtIntToFloat(
+    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_FPEXT && "Expected a G_FPEXT");
+  return matchFPCastOfExactIntToFP(MI, MRI, MatchInfo);
+}
+
 bool CombinerHelper::matchCombineExtOfExt(
     MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
   assert((MI.getOpcode() == TargetOpcode::G_ANYEXT ||
Index: llvm/test/CodeGen/AArch64/GlobalISel/combine-ext.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/combine-ext.mir
+++ llvm/test/CodeGen/AArch64/GlobalISel/combine-ext.mir
@@ -194,3 +194,36 @@
     %2:_(<2 x s64>) = G_ZEXT %1(<2 x s32>)
     $q0 = COPY %2(<2 x s64>)
 ...
+---
+name:            test_combine_half_to_float_fpext_constant
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_half_to_float_fpext_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float -5.500000e+00
+    ; CHECK: $w0 = COPY [[C]](s32)
+    %0:_(s16) = G_FCONSTANT half 0xHC580
+    %1:_(s32) = G_FPEXT %0(s16)
+    $w0 = COPY %1(s32)
+...
+---
+name:            test_combine_half_to_double_fpext_constant
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_half_to_double_fpext_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double -5.500000e+00
+    ; CHECK: $x0 = COPY [[C]](s64)
+    %0:_(s16) = G_FCONSTANT half 0xHC580
+    %1:_(s64) = G_FPEXT %0(s16)
+    $x0 = COPY %1(s64)
+...
+---
+name:            test_combine_float_to_double_fpext_constant
+body:             |
+  bb.1:
+    ; CHECK-LABEL: name: test_combine_float_to_double_fpext_constant
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 5.500000e+00
+    ; CHECK: $x0 = COPY [[C]](s64)
+    %0:_(s32) = G_FCONSTANT float 5.500000e+00
+    %1:_(s64) = G_FPEXT %0(s32)
+    $x0 = COPY %1(s64)
+...
Index: llvm/test/CodeGen/AArch64/GlobalISel/combine-fpext-suitofp.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/GlobalISel/combine-fpext-suitofp.mir
@@ -0,0 +1,49 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
+
+# (fpext ([us]itofp x)) -> ([us]itofp x)
+# Only valid when the [us]itofp is exact, i.e. x fits in the significand.
+---
+name:            test_combine_fpext_sitofp_8_to_32
+body:             |
+  bb.1:
+    liveins: $b0
+    ; CHECK-LABEL: name: test_combine_fpext_sitofp_8_to_32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY $b0
+    ; CHECK: [[SITOFP:%[0-9]+]]:_(s32) = G_SITOFP [[COPY]](s8)
+    ; CHECK: $w1 = COPY [[SITOFP]](s32)
+    %0:_(s8) = COPY $b0
+    %1:_(s16) = G_SITOFP %0(s8)
+    %2:_(s32) = G_FPEXT %1(s16)
+    $w1 = COPY %2(s32)
+...
+---
+name:            test_combine_fpext_uitofp_8_to_64
+body:             |
+  bb.1:
+    liveins: $b0
+    ; CHECK-LABEL: name: test_combine_fpext_uitofp_8_to_64
+    ; CHECK: [[COPY:%[0-9]+]]:_(s8) = COPY $b0
+    ; CHECK: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[COPY]](s8)
+    ; CHECK: $x1 = COPY [[UITOFP]](s64)
+    %0:_(s8) = COPY $b0
+    %1:_(s32) = G_UITOFP %0(s8)
+    %2:_(s64) = G_FPEXT %1(s32)
+    $x1 = COPY %2(s64)
+...
+# Don't combine when the [us]itofp may round: the result could change.
+---
+name:            test_no_combine_fpext_sitofp_inexact
+body:             |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_no_combine_fpext_sitofp_inexact
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[SITOFP:%[0-9]+]]:_(s16) = G_SITOFP [[COPY]](s32)
+    ; CHECK: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[SITOFP]](s16)
+    ; CHECK: $w1 = COPY [[FPEXT]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s16) = G_SITOFP %0(s32)
+    %2:_(s32) = G_FPEXT %1(s16)
+    $w1 = COPY %2(s32)
+...
Index: llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir
===================================================================
--- llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir
+++ llvm/test/CodeGen/AArch64/GlobalISel/combine-fptrunc.mir
@@ -34,3 +34,106 @@
     %1:_(s32) = G_FPTRUNC %0(s64)
     $w0 = COPY %1(s32)
 ...
+# (fptrunc (fpext x)) -> x
+---
+name:            test_combine_truncate_ext_32
+body:             |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_truncate_ext_32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: $w1 = COPY [[COPY]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_FPEXT %0(s32)
+    %2:_(s32) = G_FPTRUNC %1(s64)
+    $w1 = COPY %2(s32)
+...
+---
+name:            test_combine_truncate_ext_16
+body:             |
+  bb.1:
+    liveins: $h0
+    ; CHECK-LABEL: name: test_combine_truncate_ext_16
+    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK: $h1 = COPY [[COPY]](s16)
+    %0:_(s16) = COPY $h0
+    %1:_(s32) = G_FPEXT %0(s16)
+    %2:_(s16) = G_FPTRUNC %1(s32)
+    $h1 = COPY %2(s16)
+...
+# Don't combine when the types mismatch.
+---
+name:            test_combine_truncate_ext_32_non_identity
+body:             |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_truncate_ext_32_non_identity
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[COPY]](s32)
+    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FPEXT]](s64)
+    ; CHECK: $h1 = COPY [[FPTRUNC]](s16)
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_FPEXT %0(s32)
+    %2:_(s16) = G_FPTRUNC %1(s64)
+    $h1 = COPY %2(s16)
+...
+---
+name:            test_combine_truncate_ext_16_non_identity
+body:             |
+  bb.1:
+    liveins: $h0
+    ; CHECK-LABEL: name: test_combine_truncate_ext_16_non_identity
+    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK: [[FPEXT:%[0-9]+]]:_(s64) = G_FPEXT [[COPY]](s16)
+    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s32) = G_FPTRUNC [[FPEXT]](s64)
+    ; CHECK: $w1 = COPY [[FPTRUNC]](s32)
+    %0:_(s16) = COPY $h0
+    %1:_(s64) = G_FPEXT %0(s16)
+    %2:_(s32) = G_FPTRUNC %1(s64)
+    $w1 = COPY %2(s32)
+...
+# (fptrunc ([us]itofp x)) -> ([us]itofp x) when the [us]itofp is exact.
+---
+name:            test_combine_truncate_sitofp_16
+body:             |
+  bb.1:
+    liveins: $h0
+    ; CHECK-LABEL: name: test_combine_truncate_sitofp_16
+    ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+    ; CHECK: [[SITOFP:%[0-9]+]]:_(s16) = G_SITOFP [[COPY]](s16)
+    ; CHECK: $h1 = COPY [[SITOFP]](s16)
+    %0:_(s16) = COPY $h0
+    %1:_(s32) = G_SITOFP %0(s16)
+    %2:_(s16) = G_FPTRUNC %1(s32)
+    $h1 = COPY %2(s16)
+...
+---
+name:            test_combine_truncate_uitofp_32
+body:             |
+  bb.1:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_truncate_uitofp_32
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[COPY]](s32)
+    ; CHECK: $w1 = COPY [[UITOFP]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s64) = G_UITOFP %0(s32)
+    %2:_(s32) = G_FPTRUNC %1(s64)
+    $w1 = COPY %2(s32)
+...
+# Don't combine when the [us]itofp may round: the result could change.
+---
+name:            test_no_combine_truncate_uitofp_inexact
+body:             |
+  bb.1:
+    liveins: $x0
+    ; CHECK-LABEL: name: test_no_combine_truncate_uitofp_inexact
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK: [[UITOFP:%[0-9]+]]:_(s64) = G_UITOFP [[COPY]](s64)
+    ; CHECK: [[FPTRUNC:%[0-9]+]]:_(s32) = G_FPTRUNC [[UITOFP]](s64)
+    ; CHECK: $w1 = COPY [[FPTRUNC]](s32)
+    %0:_(s64) = COPY $x0
+    %1:_(s64) = G_UITOFP %0(s64)
+    %2:_(s32) = G_FPTRUNC %1(s64)
+    $w1 = COPY %2(s32)
+...
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -181,40 +181,38 @@ ; GFX6-IEEE-LABEL: v_rcp_f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; 
GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -223,9 +221,8 @@ ; GFX89: ; %bb.0: ; 
GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX89-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] @@ -235,9 +232,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -249,40 +245,38 @@ ; GFX6-IEEE-LABEL: v_rcp_f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: 
v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -291,9 +285,8 @@ ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX89-NEXT: v_rcp_f32_e32 v1, v1 -; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX89-NEXT: v_mul_f32_e32 
v1, 1.0, v1 ; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX89-NEXT: s_setpc_b64 s[30:31] @@ -303,9 +296,8 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -318,9 +310,8 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -344,40 +335,38 @@ ; GFX6-IEEE-LABEL: v_rcp_f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 -; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3 
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] @@ -834,71 +823,67 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: 
v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 
v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: 
v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -908,11 +893,10 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -928,11 +912,10 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; 
GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -948,12 +931,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -969,71 +951,67 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 
1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: 
v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1043,11 +1021,10 @@ ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 ; GFX8-NEXT: v_rcp_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1063,11 +1040,10 @@ ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1083,12 +1059,11 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-NEXT: v_rcp_f32_e32 v2, v2 -; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX10-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0 @@ -1106,11 +1081,10 @@ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 ; 
GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1150,71 +1124,67 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 -; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-IEEE-NEXT: 
v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 -; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2 +; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; 
GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6 -; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6 -; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ;