diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -927,28 +927,36 @@ // TODO: Match source modifiers. - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock *MBB = MI.getParent(); - Register Numer = MI.getOperand(3).getReg(); Register Denom = MI.getOperand(4).getReg(); unsigned ChooseDenom = MI.getOperand(5).getImm(); Register Src0 = ChooseDenom != 0 ? Numer : Denom; - auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) - .addDef(Dst1) - .addImm(0) // $src0_modifiers - .addUse(Src0) // $src0 - .addImm(0) // $src1_modifiers - .addUse(Denom) // $src1 - .addImm(0) // $src2_modifiers - .addUse(Numer) // $src2 - .addImm(0) // $clamp - .addImm(0); // $omod + MachineIRBuilder Builder(MI); + auto MIB = Builder.buildInstr(Opc) + .addDef(Dst0) + .addImm(0) // $src0_modifiers + .addUse(Src0) // $src0 + .addImm(0) // $src1_modifiers + .addUse(Denom) // $src1 + .addImm(0) // $src2_modifiers + .addUse(Numer) // $src2 + .addImm(0) // $clamp + .addImm(0); // $omod + + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) { + return false; + } + // Read the implicit VCC def through an adjacent COPY instead of rewriting + // uses of Dst1 to the physical register; a later VCC clobber would corrupt it. + Builder.buildCopy(Dst1, Register(TRI.getVCC())); + if (!RBI.constrainGenericRegister(Dst1, *TRI.getBoolRC(), *MRI)) { + return false; + } MI.eraseFromParent(); - return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); + return true; } bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const { diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -8192,6 +8192,30 @@ } } +static bool HasImplicitSDSTVCC(unsigned Opcode) { + switch (Opcode) { + default: + return false; + case AMDGPU::V_DIV_SCALE_F32_e64_gfx11: + case AMDGPU::V_DIV_SCALE_F32_e64_w32_gfx11: + case AMDGPU::V_DIV_SCALE_F32_e64_w64_gfx11: + case
AMDGPU::V_DIV_SCALE_F64_e64_gfx11: + case AMDGPU::V_DIV_SCALE_F64_e64_w32_gfx11: + case AMDGPU::V_DIV_SCALE_F64_e64_w64_gfx11: + case AMDGPU::V_DIV_SCALE_F32_gfx10: + case AMDGPU::V_DIV_SCALE_F32_w32_gfx10: + case AMDGPU::V_DIV_SCALE_F32_w64_gfx10: + case AMDGPU::V_DIV_SCALE_F64_gfx10: + case AMDGPU::V_DIV_SCALE_F64_w32_gfx10: + case AMDGPU::V_DIV_SCALE_F64_w64_gfx10: + case AMDGPU::V_DIV_SCALE_F32_gfx6_gfx7: + case AMDGPU::V_DIV_SCALE_F64_gfx6_gfx7: + case AMDGPU::V_DIV_SCALE_F32_vi: + case AMDGPU::V_DIV_SCALE_F64_vi: + return true; + } +} + void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx) { unsigned Opc = Inst.getOpcode(); @@ -8203,9 +8227,16 @@ } if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) { + const bool UsesImplicitSDSTVCC = HasImplicitSDSTVCC(Inst.getOpcode()); // This instruction has src modifiers for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + + if (UsesImplicitSDSTVCC && Op.isReg() && I == 2 && + (Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) { + continue; + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegOrImmWithFPInputModsOperands(Inst, 2); } else if (Op.isImmModifier()) { @@ -8261,6 +8292,28 @@ // Copy the operand to ensure it's not invalidated when Inst grows. Inst.insert(it, MCOperand(Inst.getOperand(0))); // src2 = dst } + + // Fix AsmParserOnly Opcodes to canonical opcodes. 
+ switch (Inst.getOpcode()) { + default: + break; + case AMDGPU::V_DIV_SCALE_F32_e64_w32_gfx11: + case AMDGPU::V_DIV_SCALE_F32_e64_w64_gfx11: + Inst.setOpcode(AMDGPU::V_DIV_SCALE_F32_e64_gfx11); + break; + case AMDGPU::V_DIV_SCALE_F64_e64_w32_gfx11: + case AMDGPU::V_DIV_SCALE_F64_e64_w64_gfx11: + Inst.setOpcode(AMDGPU::V_DIV_SCALE_F64_e64_gfx11); + break; + case AMDGPU::V_DIV_SCALE_F32_w32_gfx10: + case AMDGPU::V_DIV_SCALE_F32_w64_gfx10: + Inst.setOpcode(AMDGPU::V_DIV_SCALE_F32_gfx10); + break; + case AMDGPU::V_DIV_SCALE_F64_w32_gfx10: + case AMDGPU::V_DIV_SCALE_F64_w64_gfx10: + Inst.setOpcode(AMDGPU::V_DIV_SCALE_F64_gfx10); + break; + } } void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -761,10 +761,22 @@ O << "/*INV_OP*/"; } - // Print default vcc/vcc_lo operand of v_cndmask_b32_e32. + // Print default vcc/vcc_lo operand for specific instructions. switch (MI->getOpcode()) { default: break; - + case AMDGPU::V_DIV_SCALE_F32_e64_gfx11: + case AMDGPU::V_DIV_SCALE_F64_e64_gfx11: + case AMDGPU::V_DIV_SCALE_F32_gfx10: + case AMDGPU::V_DIV_SCALE_F64_gfx10: { + // Print vcc(_lo) after the vdst for V_DIV_SCALE on gfx10+. 
+ int VDstIdx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vdst); + assert(VDstIdx != -1); + if ((int)OpNo == VDstIdx) { + printDefaultVccOperand(false, STI, O); + } + break; + } case AMDGPU::V_CNDMASK_B32_e32_gfx10: case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10: case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12423,8 +12423,19 @@ if (ST.isWave32() && !MF.empty()) { for (auto &MBB : MF) { - for (auto &MI : MBB) { + for (auto I = MBB.begin(); I != MBB.end(); ++I) { + auto &MI = *I; TII->fixImplicitOperands(MI); + + if (MI.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 || + MI.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) { + // Fixup adjacent copy of the VCC impdef so it's also VCC_LO. + auto NextI = std::next(I); + if (NextI != MBB.end() && NextI->getOpcode() == AMDGPU::COPY && + NextI->getOperand(1).getReg() == AMDGPU::VCC) { + NextI->getOperand(1).setReg(AMDGPU::VCC_LO); + } + } } } } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -709,6 +709,11 @@ Register SrcReg = MI.getOperand(1).getReg(); assert(!MI.getOperand(1).getSubReg()); + // VCC already represents a lane mask and doesn't need special lowering. 
+ if (SrcReg == AMDGPU::VCC_LO || SrcReg == AMDGPU::VCC) { + continue; + } + if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) { assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32); unsigned TmpReg = createLaneMaskReg(*MF); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -20,15 +20,16 @@ } class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { - let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); - let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; + let Outs64 = (outs DstRC:$vdst); + + let Asm64 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; let IsSingle = 1; let HasExtVOP3DPP = 0; let HasExtDPP = 0; } -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile; +def VOP3b_F32_VCC_F32_F32_F32 : VOP3b_Profile; +def VOP3b_F64_VCC_F64_F64_F64 : VOP3b_Profile; def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -220,13 +221,13 @@ } // End isReMaterializable = 1 -let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. +let Defs = [VCC], mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. let SchedRW = [WriteFloatFMA, WriteSALU] in - defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ; + defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_VCC_F32_F32_F32> ; // Double precision division pre-scale. 
let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in - defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>; + defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_VCC_F64_F64_F64>; -} // End mayRaiseFPException = 0 +} // End Defs = [VCC], mayRaiseFPException = 0 let isReMaterializable = 1 in @@ -842,6 +843,32 @@ // GFX11. //===----------------------------------------------------------------------===// +let AssemblerPredicate = isGFX11Only in +multiclass VOP3be_VCCSDST_Real_gfx11 op, string asmName> { + defvar ps = !cast(NAME #"_e64"); + + def _e64_gfx11 : VOP3_Real, + VOP3be_gfx11 { + let IsSingle = ps.Pfl.IsSingle; + let sdst = !cast(VCC.HWEncoding); + let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); + } + + let isAsmParserOnly = 1 in { + def _e64_w32_gfx11 : VOP3_Base, VOP3be_gfx11 { + let sdst = !cast(VCC_LO.HWEncoding); + let WaveSizePredicate = isWave32; + let AsmString = asmName # !subst("vcc", "vcc_lo", ps.AsmOperands); + } + + def _e64_w64_gfx11 : VOP3_Base, VOP3be_gfx11 { + let sdst = !cast(VCC.HWEncoding); + let WaveSizePredicate = isWave64; + let AsmString = asmName # ps.AsmOperands; + } + } +} + defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">; defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>; defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>; @@ -916,8 +943,8 @@ defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>; defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>; defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>; -defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">; -defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; +defm V_DIV_SCALE_F32 : VOP3be_VCCSDST_Real_gfx11<0x2fc, "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_VCCSDST_Real_gfx11<0x2fd, "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; defm V_MAD_I64_I32_gfx11 : 
VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>; @@ -989,6 +1016,30 @@ VOP3_Real(NAME#"_e64"), SIEncodingFamily.GFX10>, VOP3be_gfx10(NAME#"_e64").Pfl>; } + multiclass VOP3be_VCCSDST_Real_gfx10 op, string asmName> { + defvar ps = !cast(NAME #"_e64"); + + def _gfx10 : VOP3_Real, + VOP3be_gfx10 { + let IsSingle = ps.Pfl.IsSingle; + let sdst = !cast(VCC_LO.HWEncoding); + let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands); + } + + let isAsmParserOnly = 1 in { + def _w32_gfx10 : VOP3_Base, VOP3be_gfx10 { + let sdst = !cast(VCC_LO.HWEncoding); + let WaveSizePredicate = isWave32; + let AsmString = asmName # !subst("vcc", "vcc_lo", ps.AsmOperands); + } + + def _w64_gfx10 : VOP3_Base, VOP3be_gfx10 { + let sdst = !cast(VCC.HWEncoding); + let WaveSizePredicate = isWave64; + let AsmString = asmName # ps.AsmOperands; + } + } + } multiclass VOP3Interp_Real_gfx10 op> { def _gfx10 : VOP3_Real(NAME), SIEncodingFamily.GFX10>, @@ -1135,6 +1186,13 @@ VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, VOP3be_gfx6_gfx7(NAME#"_e64").Pfl>; } + multiclass VOP3be_VCCSDST_Real_gfx6_gfx7 op> { + def _gfx6_gfx7 : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.SI>, + VOP3be_gfx6_gfx7(NAME#"_e64").Pfl> { + let sdst = !cast(VCC.HWEncoding); + } + } } // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" multiclass VOP3_Real_gfx6_gfx7_gfx10 op> : @@ -1143,6 +1201,9 @@ multiclass VOP3be_Real_gfx6_gfx7_gfx10 op> : VOP3be_Real_gfx6_gfx7, VOP3be_Real_gfx10; +multiclass VOP3be_VCCSDST_Real_gfx6_gfx7_gfx10 op, string asmName> : + VOP3be_VCCSDST_Real_gfx6_gfx7, VOP3be_VCCSDST_Real_gfx10; + defm V_LSHL_B64 : VOP3_Real_gfx6_gfx7<0x161>; defm V_LSHR_B64 : VOP3_Real_gfx6_gfx7<0x162>; defm V_ASHR_I64 : VOP3_Real_gfx6_gfx7<0x163>; @@ -1194,8 +1255,8 @@ defm V_MSAD_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x171>; defm V_MQSAD_PK_U16_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x173>; defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>; -defm 
V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>; -defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>; +defm V_DIV_SCALE_F32 : VOP3be_VCCSDST_Real_gfx6_gfx7_gfx10<0x16d, "v_div_scale_f32">; +defm V_DIV_SCALE_F64 : VOP3be_VCCSDST_Real_gfx6_gfx7_gfx10<0x16e, "v_div_scale_f64">; // NB: Same opcode as v_mad_legacy_f32 let DecoderNamespace = "GFX10_B" in @@ -1221,6 +1282,13 @@ VOP3be_vi (NAME#"_e64").Pfl>; } +multiclass VOP3be_VCCSDST_Real_vi op> { + def _vi : VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3be_vi (NAME#"_e64").Pfl> { + let sdst = !cast(VCC.HWEncoding); + } +} + multiclass VOP3OpSel_Real_gfx9 op> { def _vi : VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, VOP3OpSel_gfx9 (NAME#"_e64").Pfl>; @@ -1325,8 +1393,8 @@ defm V_CVT_PK_U8_F32 : VOP3_Real_vi <0x1dd>; defm V_DIV_FIXUP_F32 : VOP3_Real_vi <0x1de>; defm V_DIV_FIXUP_F64 : VOP3_Real_vi <0x1df>; -defm V_DIV_SCALE_F32 : VOP3be_Real_vi <0x1e0>; -defm V_DIV_SCALE_F64 : VOP3be_Real_vi <0x1e1>; +defm V_DIV_SCALE_F32 : VOP3be_VCCSDST_Real_vi <0x1e0>; +defm V_DIV_SCALE_F64 : VOP3be_VCCSDST_Real_vi <0x1e1>; defm V_DIV_FMAS_F32 : VOP3_Real_vi <0x1e2>; defm V_DIV_FMAS_F64 : VOP3_Real_vi <0x1e3>; defm V_MSAD_U8 : VOP3_Real_vi <0x1e4>; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -155,10 +155,8 @@ bit IsSingle = ps.Pfl.IsSingle; } -class VOP3_Real : - VOP_Real , - InstSI , - SIMCInstr { +class VOP3_Base : + InstSI { let VALU = 1; let VOP3 = 1; @@ -185,6 +183,11 @@ VOPProfile Pfl = ps.Pfl; } +class VOP3_Real : + VOP3_Base , + VOP_Real , + SIMCInstr ; + // XXX - Is there any reason to distinguish this from regular VOP3 // here? 
class VOP3P_Real : diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10PLUS %s ; Make sure we don't violate the constant bus restriction @@ -216,18 +216,13 @@ ; GFX9-LABEL: div_scale_s_s_true: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], s2, v0, s2 +; GFX9-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: div_scale_s_s_true: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s3, s2 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: div_scale_s_s_true: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: div_scale_s_s_true: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s3, s2 +; GFX10PLUS-NEXT: ; return to shader part 
epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 true) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result @@ -237,18 +232,13 @@ ; GFX9-LABEL: div_scale_s_s_false: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s2 +; GFX9-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: div_scale_s_s_false: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f32 v0, s0, s3, s3, s2 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: div_scale_s_s_false: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: div_scale_s_s_false: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_div_scale_f32 v0, vcc_lo, s3, s3, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 false) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -21,7 +21,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -40,7 +40,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; 
GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -140,7 +140,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -159,7 +159,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -221,7 +221,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -240,7 +240,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -302,7 +302,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -321,7 +321,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -410,7 +410,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -429,7 +429,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -506,7 +506,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, 
-v2, v3, 1.0 @@ -525,7 +525,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -589,7 +589,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -600,7 +600,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -620,7 +620,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -637,7 +637,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, 
v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -820,7 +820,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -831,7 +831,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -851,7 +851,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -868,7 +868,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -982,7 +982,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; 
GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 @@ -993,7 +993,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1013,7 +1013,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1030,7 +1030,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1137,7 +1137,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, 
-v3, v4, 1.0 @@ -1148,7 +1148,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1168,7 +1168,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1185,7 +1185,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1350,7 +1350,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 @@ -1361,7 +1361,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 
v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1381,7 +1381,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1398,7 +1398,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1533,7 +1533,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1544,7 +1544,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1564,7 +1564,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; 
GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1581,7 +1581,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -18,7 +18,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -34,7 +34,7 @@ ; GFX6-FLUSH-LABEL: v_fdiv_f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -52,7 +52,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: 
v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -68,7 +68,7 @@ ; GFX89-FLUSH-LABEL: v_fdiv_f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -87,7 +87,7 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -104,7 +104,7 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -123,7 +123,7 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 @@ -145,7 +145,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 
0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 @@ -200,7 +200,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -229,7 +229,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -246,7 +246,7 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -275,7 +275,7 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 @@ -315,7 +315,7 @@ ; GFX6-IEEE-LABEL: 
v_rcp_f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -331,7 +331,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -349,7 +349,7 @@ ; GFX89-IEEE-LABEL: v_rcp_f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -365,7 +365,7 @@ ; GFX89-FLUSH-LABEL: v_rcp_f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -384,7 +384,7 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 @@ -401,7 +401,7 
@@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -420,7 +420,7 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 @@ -442,7 +442,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 @@ -469,7 +469,7 @@ ; GFX6-IEEE-LABEL: v_rcp_f32_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -485,7 +485,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_f32_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; 
GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -503,7 +503,7 @@ ; GFX89-IEEE-LABEL: v_rcp_f32_arcp: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -519,7 +519,7 @@ ; GFX89-FLUSH-LABEL: v_rcp_f32_arcp: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -538,7 +538,7 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 @@ -555,7 +555,7 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -574,7 +574,7 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-IEEE-NEXT: 
v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 @@ -596,7 +596,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, vcc_lo, v0, v0, 1.0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 @@ -739,7 +739,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -768,7 +768,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -785,7 +785,7 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 @@ -814,7 +814,7 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v1, v0 ; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 @@ -854,7 +854,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -864,7 +864,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -881,7 +881,7 @@ ; GFX6-FLUSH-LABEL: v_fdiv_v2f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -893,7 +893,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ 
-912,35 +912,34 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: 
v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_fdiv_v2f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -951,11 +950,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v7, v8, v6, v7 ; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v5 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v7, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v7, v5 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v5, v7, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v7, v7 @@ -972,28 +971,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1001,7 +999,7 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; 
GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -1012,20 +1010,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v4, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v5, v4, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v5, v4, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,36 +1031,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; 
GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1070,7 +1069,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 @@ -1085,26 +1084,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 ; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v5, v6, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v6, v4, v6 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 -; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v6 +; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v5, v4, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v6 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v5, v4, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v4 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b @@ -1148,7 +1147,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; 
GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1158,7 +1157,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -1194,28 +1193,27 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 
v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,28 +1221,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 
+; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1270,36 +1267,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1329,7 +1327,7 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1339,7 +1337,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1356,7 +1354,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; 
GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1368,7 +1366,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1387,35 +1385,34 @@ ; GFX89-IEEE-LABEL: v_rcp_v2f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, 
v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_rcp_v2f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1426,11 +1423,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 @@ -1447,28 +1444,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v1, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; 
GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1476,7 +1472,7 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -1487,20 +1483,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1508,36 +1504,37 @@ ; GFX11-IEEE: ; %bb.0: ; 
GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 -; GFX11-IEEE-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v1, 1.0 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1545,7 +1542,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, 
vcc_lo, v0, v0, 1.0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 @@ -1560,26 +1557,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x @@ -1590,7 +1587,7 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1600,7 +1597,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1617,7 +1614,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1629,7 +1626,7 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1648,35 +1645,34 @@ 
; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v3, 
v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1687,11 +1683,11 @@ ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 ; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 @@ -1708,28 +1704,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, 
v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v1, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1737,7 +1732,7 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 
v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 @@ -1748,20 +1743,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1769,36 +1764,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, 
vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 -; GFX11-IEEE-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v6, vcc_lo, 1.0, v1, 1.0 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,7 +1802,7 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v0, v0, 1.0 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 @@ -1821,26 +1817,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v1, 1.0 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x @@ -1994,7 +1990,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; 
GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -2004,7 +2000,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -2040,28 +2036,27 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: 
v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2069,28 +2064,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; 
GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2116,36 +2110,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; 
GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu 
instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -18,8 +18,8 @@ ; GFX6-LABEL: v_fdiv_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -37,7 +37,7 @@ ; GFX8-LABEL: v_fdiv_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], 
v[6:7], v[8:9], v[6:7] @@ -53,7 +53,7 @@ ; GFX9-LABEL: v_fdiv_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -70,7 +70,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -87,7 +87,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -161,8 +161,8 @@ ; GFX6-LABEL: v_fdiv_f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -180,7 +180,7 @@ ; GFX8-LABEL: v_fdiv_f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 
v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -196,7 +196,7 @@ ; GFX9-LABEL: v_fdiv_f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -213,7 +213,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -230,7 +230,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -255,8 +255,8 @@ ; GFX6-LABEL: v_rcp_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 
v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -275,7 +275,7 @@ ; GFX8-LABEL: v_rcp_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -291,7 +291,7 @@ ; GFX9-LABEL: v_rcp_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -308,7 +308,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -325,7 +325,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -350,8 +350,8 @@ ; GFX6-LABEL: v_rcp_f64_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 
v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -370,7 +370,7 @@ ; GFX8-LABEL: v_rcp_f64_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -386,7 +386,7 @@ ; GFX9-LABEL: v_rcp_f64_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -403,7 +403,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -420,7 +420,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -494,8 +494,8 @@ ; GFX6-LABEL: v_rcp_f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -514,7 +514,7 @@ ; GFX8-LABEL: v_rcp_f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -530,7 +530,7 @@ ; GFX9-LABEL: v_rcp_f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -547,7 +547,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -564,7 +564,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], 1.0 ; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] @@ -638,8 +638,8 @@ ; GFX6-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -657,7 +657,7 @@ ; GFX8-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -673,7 +673,7 @@ ; GFX9-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -690,7 +690,7 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -707,7 +707,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 
-; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] @@ -732,90 +732,87 @@ ; GFX6-LABEL: v_fdiv_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; 
GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], 
vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], 
v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; 
GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -823,28 +820,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; 
GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -852,37 +848,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b @@ -969,90 +965,87 @@ ; GFX6-LABEL: 
v_fdiv_v2f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 
v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], 
-v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; 
GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], 
v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1060,28 +1053,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], 
vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1089,37 +1081,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: 
v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 @@ -1130,91 +1122,88 @@ ; GFX6-LABEL: v_rcp_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 
; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; 
GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: 
v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; 
GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,28 +1211,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; 
GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1251,37 +1239,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], 
v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], 
v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x @@ -1292,91 +1280,88 @@ ; GFX6-LABEL: v_rcp_v2f64_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], 
s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; 
GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], 
vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 
v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1384,28 +1369,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: 
v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1413,37 +1397,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: 
v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> , %x @@ -1530,91 +1514,88 @@ ; GFX6-LABEL: v_rcp_v2f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: 
v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 
+; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], 
v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 
1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: 
v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1622,28 +1603,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; 
GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1651,37 +1631,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], 
v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x, !fpmath !0 @@ -1768,90 +1748,87 @@ ; GFX6-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 
v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_fdiv_v2f64_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], 
v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], 
v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1859,28 +1836,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 
1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], 
v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1888,37 +1864,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 
-; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; 
GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -13,7 +13,7 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -166,7 +166,7 @@ ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -195,7 +195,7 @@ ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -321,7 +321,7 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f64 
v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; CI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[2:3] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -349,7 +349,7 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[2:3] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -499,9 +499,9 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_lshr_b32 s6, s0, 16 -; CI-NEXT: s_lshr_b32 s3, s2, 16 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_lshr_b32 s3, s0, 16 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -516,11 +516,11 @@ ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v1 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -599,11 +599,11 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_lshr_b32 s10, s0, 16 -; 
CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0 -; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_lshr_b32 s6, s2, 16 +; CI-NEXT: s_lshr_b32 s7, s3, 16 +; CI-NEXT: s_lshr_b32 s8, s0, 16 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; CI-NEXT: s_lshr_b32 s9, s1, 16 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -618,11 +618,11 @@ ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -640,7 +640,7 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v2 ; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -655,10 +655,10 @@ ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v2, -v4, v3, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[0:1], v4, v4, v3 +; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v3 ; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -762,7 +762,7 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -778,7 +778,7 @@ ; CI-NEXT: v_trunc_f32_e32 v1, v1 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s3 ; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -807,7 +807,7 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -823,7 +823,7 @@ ; VI-NEXT: v_trunc_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; VI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s3 ; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -860,7 +860,7 @@ ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s0 ; CI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -876,7 +876,7 @@ ; CI-NEXT: v_trunc_f32_e32 v1, v1 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s1 ; CI-NEXT: 
v_div_scale_f32 v3, vcc, s1, v1, s1 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -892,7 +892,7 @@ ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s10 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, s2 ; CI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -908,7 +908,7 @@ ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v2, -v3, v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, s3 ; CI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -937,7 +937,7 @@ ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s0 ; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -953,7 +953,7 @@ ; VI-NEXT: v_trunc_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 +; VI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s1 ; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -969,7 +969,7 @@ ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 +; VI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, s2 ; VI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 ; VI-NEXT: v_rcp_f32_e32 v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -985,7 +985,7 @@ ; VI-NEXT: v_trunc_f32_e32 v3, v3 ; VI-NEXT: 
v_fma_f32 v2, -v3, v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 +; VI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, s3 ; VI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 ; VI-NEXT: v_rcp_f32_e32 v6, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1023,7 +1023,7 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] +; CI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[0:1] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1038,7 +1038,7 @@ ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] +; CI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], s[2:3] ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -1066,7 +1066,7 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] +; VI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[0:1] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1081,7 +1081,7 @@ ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] +; VI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], s[2:3] ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -20,8 +20,8 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -40,7 +40,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -55,7 +55,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -69,7 +69,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -102,8 +102,8 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v2, v0, 
v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -122,7 +122,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -137,7 +137,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -151,7 +151,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v1, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -186,7 +186,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -208,7 +208,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -224,7 +224,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -239,7 +239,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -274,7 +274,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -296,7 +296,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -312,7 +312,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -327,7 +327,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: 
global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -360,7 +360,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s8 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -376,7 +376,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -391,7 +391,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -403,7 +403,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -433,7 +433,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -449,7 +449,7 @@ ; 
GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -464,7 +464,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -476,7 +476,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -506,7 +506,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, s8, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, s8, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -522,7 +522,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -537,7 +537,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -549,7 +549,7 @@ ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -579,7 +579,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, s8, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, s8, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -595,7 +595,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -610,7 +610,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -622,7 +622,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -652,7 +652,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -670,7 +670,7 @@ ; 
GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -683,7 +683,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -696,7 +696,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -726,7 +726,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -744,7 +744,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -757,7 +757,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], v[0:1], s[0:1] ; 
GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -770,7 +770,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -800,7 +800,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -818,7 +818,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -831,7 +831,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -844,7 +844,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -874,7 +874,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -892,7 +892,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -905,7 +905,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -918,7 +918,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, v[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -942,7 +942,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -954,7 +954,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -968,7 +968,7 @@ ; 
GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s5, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -980,7 +980,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -999,7 +999,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1011,7 +1011,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1025,7 +1025,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -1037,7 +1037,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-NEXT: s_endpgm @@ -1056,7 +1056,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1070,7 +1070,7 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1084,7 +1084,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1096,7 +1096,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[4:5], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1115,7 +1115,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1129,7 +1129,7 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1143,7 +1143,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1155,7 +1155,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1177,9 +1177,9 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, 1.0 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, 1.0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1194,7 +1194,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1208,7 +1208,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, 
s2, v0, v0, 1.0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, 1.0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1219,7 +1219,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1245,9 +1245,9 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, 2.0, 2.0, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1262,7 +1262,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1276,7 +1276,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, 2.0, 2.0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1287,7 +1287,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, 2.0, 2.0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1316,9 
+1316,9 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1338,7 +1338,7 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1355,7 +1355,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v2, v2, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1370,7 +1370,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1405,9 +1405,9 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1427,7 +1427,7 @@ 
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1443,7 +1443,7 @@ ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -1459,7 +1459,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1484,17 +1484,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, s0, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_val_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1507,7 +1507,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, 0x41000000 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1516,7 +1516,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, 0x41000000 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1531,17 +1531,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_undef_val_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1554,7 +1554,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1563,7 +1563,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, 0x41000000, 0x41000000, s0 ; GFX11-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1577,16 +1577,16 @@ ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, s0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, s0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1599,7 +1599,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1608,7 +1608,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc_lo, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1624,7 +1624,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0x40200000 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -1635,7 +1635,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: s_mov_b32 s3, 
0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -1648,7 +1648,7 @@ ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40200000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1659,7 +1659,7 @@ ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40200000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc_lo, s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -11,24 +11,25 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %4:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %5:vgpr_32 = nofpexcept 
V_RCP_F32_e64 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $vcc ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %11:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %4, 0, %5, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %11, 0, %5, 0, %5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %6, 0, %12, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %4, 0, %13, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %4, 0, %15, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 
2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[COPY2]] + ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %16, 0, %12, 0, %15, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %17, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY %18 ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv float %a, %b @@ -42,24 +43,25 @@ ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %4:vgpr_32 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %5:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %6:vgpr_32 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit-def $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $vcc ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = 
S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %11:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %4, 0, %5, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %11, 0, %5, 0, %5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %6, 0, %12, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %4, 0, %13, 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %4, 0, %15, 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: 
$vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[COPY2]] + ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %16, 0, %12, 0, %15, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %17, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY %18 ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv nnan float %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -6,12 +6,12 @@ ; GCN-LABEL: {{^}}fdiv_f64: ; GCN-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], vcc, [[DEN]], [[DEN]], [[NUM]] ; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] ; Check for div_scale bug workaround on SI -; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], vcc, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] ; GCN-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -28,18 +28,18 @@ ; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, 
v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -69,18 +69,18 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v3, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v4, v3 +; CI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: 
v_div_fmas_f32 v2, v2, v3, v5 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -506,18 +506,18 @@ ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; SI-NEXT: v_fma_f32 v4, v5, v4, v4 -; SI-NEXT: v_mul_f32_e32 v5, v2, v4 -; SI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; SI-NEXT: v_fma_f32 v5, v6, v4, v5 -; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -542,18 +542,18 @@ ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 -; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; CI-NEXT: v_rcp_f32_e32 v4, v3 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; CI-NEXT: v_rcp_f32_e32 v3, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; CI-NEXT: v_fma_f32 v4, v5, v4, v4 -; CI-NEXT: v_mul_f32_e32 v5, v2, v4 -; CI-NEXT: v_fma_f32 v6, -v3, v5, v2 -; CI-NEXT: v_fma_f32 v5, v6, v4, v5 -; 
CI-NEXT: v_fma_f32 v2, -v3, v5, v2 +; CI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; CI-NEXT: v_fma_f32 v3, v5, v3, v3 +; CI-NEXT: v_mul_f32_e32 v5, v4, v3 +; CI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; CI-NEXT: v_fma_f32 v5, v6, v3, v5 +; CI-NEXT: v_fma_f32 v2, -v2, v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; CI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -576,18 +576,18 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 -; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 -; VI-NEXT: v_rcp_f32_e32 v6, v5 +; VI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v4 +; VI-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4 +; VI-NEXT: v_rcp_f32_e32 v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; VI-NEXT: v_fma_f32 v6, v7, v6, v6 -; VI-NEXT: v_mul_f32_e32 v7, v3, v6 -; VI-NEXT: v_fma_f32 v8, -v5, v7, v3 -; VI-NEXT: v_fma_f32 v7, v8, v6, v7 -; VI-NEXT: v_fma_f32 v3, -v5, v7, v3 +; VI-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; VI-NEXT: v_fma_f32 v5, v7, v5, v5 +; VI-NEXT: v_mul_f32_e32 v7, v6, v5 +; VI-NEXT: v_fma_f32 v8, -v3, v7, v6 +; VI-NEXT: v_fma_f32 v7, v8, v5, v7 +; VI-NEXT: v_fma_f32 v3, -v3, v7, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; VI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; VI-NEXT: v_div_fixup_f32 v3, v3, v2, v4 ; VI-NEXT: v_trunc_f32_e32 v3, v3 ; VI-NEXT: v_fma_f32 v2, -v3, v2, v4 @@ -603,18 +603,18 @@ ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 -; GFX9-NEXT: v_rcp_f32_e32 v5, v4 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v2, v2, 
v1 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1 +; GFX9-NEXT: v_rcp_f32_e32 v4, v3 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX9-NEXT: v_fma_f32 v5, v6, v5, v5 -; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3 -; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6 -; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX9-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX9-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX9-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX9-NEXT: v_fma_f32 v6, v7, v4, v6 +; GFX9-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX9-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -632,18 +632,18 @@ ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 -; GFX10-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v2, v1 +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v5 -; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX10-NEXT: v_fma_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v5 -; GFX10-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX10-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v4 +; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX10-NEXT: v_fma_f32 v7, -v3, v6, v5 +; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX10-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX10-NEXT: 
v_div_fixup_f32 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -661,23 +661,23 @@ ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v2, v2, v1 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v5, v4 +; GFX11-NEXT: v_rcp_f32_e32 v4, v3 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX11-NEXT: v_fma_f32 v7, -v4, v6, v3 +; GFX11-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX11-NEXT: v_fma_f32 v7, -v3, v6, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v5 -; GFX11-NEXT: v_fma_f32 v3, -v4, v6, v3 +; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v4 +; GFX11-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; GFX11-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 @@ -987,13 +987,13 @@ ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 
v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1040,7 +1040,7 @@ ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; CI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1071,7 +1071,7 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] +; VI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -1097,7 +1097,7 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1125,7 +1125,7 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -1152,7 +1152,7 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc_lo, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1584,36 +1584,36 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 -; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; SI-NEXT: v_fma_f32 v6, v7, v6, v6 -; SI-NEXT: v_mul_f32_e32 v7, v4, v6 -; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; SI-NEXT: v_fma_f32 v7, v8, v6, v7 -; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; SI-NEXT: v_fma_f32 v5, v7, v5, v5 +; SI-NEXT: v_mul_f32_e32 v7, v6, v5 +; SI-NEXT: v_fma_f32 v8, -v4, v7, v6 +; SI-NEXT: v_fma_f32 v7, v8, v5, v7 +; SI-NEXT: v_fma_f32 v4, -v4, v7, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v0, -v4, v2, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 +; SI-NEXT: v_rcp_f32_e32 v4, v2 +; SI-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v6, v5, v5 -; SI-NEXT: v_mul_f32_e32 v6, v2, v5 -; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 -; SI-NEXT: v_fma_f32 v6, v7, v5, v6 -; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 +; SI-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v6, v4, v4 +; SI-NEXT: v_mul_f32_e32 v6, v5, v4 +; SI-NEXT: v_fma_f32 v7, -v2, v6, v5 +; SI-NEXT: v_fma_f32 v6, v7, v4, v6 +; SI-NEXT: v_fma_f32 v2, -v2, v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 +; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 @@ -1647,36 +1647,36 @@ ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 -; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 -; CI-NEXT: v_rcp_f32_e32 v6, v5 +; CI-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; CI-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v4, v6 -; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; CI-NEXT: v_fma_f32 v5, v7, v5, v5 +; CI-NEXT: v_mul_f32_e32 v7, v6, v5 +; CI-NEXT: v_fma_f32 v8, -v4, v7, v6 +; CI-NEXT: v_fma_f32 v7, v8, v5, v7 +; CI-NEXT: v_fma_f32 v4, -v4, v7, v6 ; CI-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; CI-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 -; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; CI-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v2, v5 -; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 +; CI-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v6, v4, v4 +; CI-NEXT: v_mul_f32_e32 v6, v5, v4 +; CI-NEXT: v_fma_f32 v7, -v2, v6, v5 +; CI-NEXT: v_fma_f32 v6, v7, v4, v6 +; CI-NEXT: v_fma_f32 v2, -v2, v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 +; CI-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 @@ -1875,70 +1875,70 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 -; SI-NEXT: v_rcp_f32_e32 v10, v9 +; SI-NEXT: v_div_scale_f32 v8, vcc, v1, v1, v5 +; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_scale_f32 v10, vcc, v5, v1, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; SI-NEXT: v_fma_f32 v10, v11, v10, v10 -; SI-NEXT: v_mul_f32_e32 v11, v8, v10 -; SI-NEXT: 
v_fma_f32 v12, -v9, v11, v8 -; SI-NEXT: v_fma_f32 v11, v12, v10, v11 -; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v11, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v11, v9, v9 +; SI-NEXT: v_mul_f32_e32 v11, v10, v9 +; SI-NEXT: v_fma_f32 v12, -v8, v11, v10 +; SI-NEXT: v_fma_f32 v11, v12, v9, v11 +; SI-NEXT: v_fma_f32 v8, -v8, v11, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; SI-NEXT: v_div_fmas_f32 v8, v8, v9, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; SI-NEXT: v_trunc_f32_e32 v8, v8 ; SI-NEXT: v_fma_f32 v1, -v8, v1, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 -; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 -; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_scale_f32 v5, vcc, v7, v7, v4 +; SI-NEXT: v_rcp_f32_e32 v8, v5 +; SI-NEXT: v_div_scale_f32 v9, vcc, v4, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; SI-NEXT: v_fma_f32 v9, v10, v9, v9 -; SI-NEXT: v_mul_f32_e32 v10, v5, v9 -; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 -; SI-NEXT: v_fma_f32 v10, v11, v9, v10 -; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 +; SI-NEXT: v_fma_f32 v10, -v5, v8, 1.0 +; SI-NEXT: v_fma_f32 v8, v10, v8, v8 +; SI-NEXT: v_mul_f32_e32 v10, v9, v8 +; SI-NEXT: v_fma_f32 v11, -v5, v10, v9 +; SI-NEXT: v_fma_f32 v10, v11, v8, v10 +; SI-NEXT: v_fma_f32 v5, -v5, v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 +; SI-NEXT: v_div_fmas_f32 v5, v5, v8, v10 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; SI-NEXT: v_trunc_f32_e32 v5, v5 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 -; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 -; SI-NEXT: v_rcp_f32_e32 v7, v5 +; SI-NEXT: 
v_div_scale_f32 v4, vcc, v0, v0, v3 +; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v7, vcc, v3, v0, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; SI-NEXT: v_fma_f32 v7, v8, v7, v7 -; SI-NEXT: v_mul_f32_e32 v8, v4, v7 -; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 -; SI-NEXT: v_fma_f32 v8, v9, v7, v8 -; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 +; SI-NEXT: v_fma_f32 v8, -v4, v5, 1.0 +; SI-NEXT: v_fma_f32 v5, v8, v5, v5 +; SI-NEXT: v_mul_f32_e32 v8, v7, v5 +; SI-NEXT: v_fma_f32 v9, -v4, v8, v7 +; SI-NEXT: v_fma_f32 v8, v9, v5, v8 +; SI-NEXT: v_fma_f32 v4, -v4, v8, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; SI-NEXT: v_div_fmas_f32 v4, v4, v5, v8 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v0, -v4, v0, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v3, vcc, v6, v6, v2 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: v_div_scale_f32 v5, vcc, v2, v6, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v7, v5, v5 -; SI-NEXT: v_mul_f32_e32 v7, v3, v5 -; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 -; SI-NEXT: v_fma_f32 v7, v8, v5, v7 -; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 +; SI-NEXT: v_fma_f32 v7, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v7, v4, v4 +; SI-NEXT: v_mul_f32_e32 v7, v5, v4 +; SI-NEXT: v_fma_f32 v8, -v3, v7, v5 +; SI-NEXT: v_fma_f32 v7, v8, v4, v7 +; SI-NEXT: v_fma_f32 v3, -v3, v7, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v7 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 ; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 @@ -1978,70 
+1978,70 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 -; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 -; CI-NEXT: v_rcp_f32_e32 v10, v9 +; CI-NEXT: v_div_scale_f32 v8, vcc, v1, v1, v5 +; CI-NEXT: v_div_scale_f32 v10, vcc, v5, v1, v5 +; CI-NEXT: v_rcp_f32_e32 v9, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; CI-NEXT: v_fma_f32 v10, v11, v10, v10 -; CI-NEXT: v_mul_f32_e32 v11, v8, v10 -; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; CI-NEXT: v_fma_f32 v11, v12, v10, v11 -; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; CI-NEXT: v_fma_f32 v11, -v8, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v11, v9, v9 +; CI-NEXT: v_mul_f32_e32 v11, v10, v9 +; CI-NEXT: v_fma_f32 v12, -v8, v11, v10 +; CI-NEXT: v_fma_f32 v11, v12, v9, v11 +; CI-NEXT: v_fma_f32 v8, -v8, v11, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; CI-NEXT: v_div_fmas_f32 v8, v8, v9, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; CI-NEXT: v_trunc_f32_e32 v8, v8 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 -; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 +; CI-NEXT: v_div_scale_f32 v5, vcc, v7, v7, v4 +; CI-NEXT: v_div_scale_f32 v9, vcc, v4, v7, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_rcp_f32_e32 v9, v8 +; CI-NEXT: v_rcp_f32_e32 v8, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; CI-NEXT: v_fma_f32 v9, v10, v9, v9 -; CI-NEXT: v_mul_f32_e32 v10, v5, v9 -; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 -; CI-NEXT: v_fma_f32 v10, v11, v9, v10 -; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 +; CI-NEXT: v_fma_f32 v10, -v5, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v10, v8, v8 +; CI-NEXT: v_mul_f32_e32 v10, v9, v8 +; CI-NEXT: 
v_fma_f32 v11, -v5, v10, v9 +; CI-NEXT: v_fma_f32 v10, v11, v8, v10 +; CI-NEXT: v_fma_f32 v5, -v5, v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 +; CI-NEXT: v_div_fmas_f32 v5, v5, v8, v10 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_or_b32_e32 v1, v4, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 -; CI-NEXT: v_rcp_f32_e32 v7, v5 +; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v0, v3 +; CI-NEXT: v_div_scale_f32 v7, vcc, v3, v0, v3 +; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v4, v7 -; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 +; CI-NEXT: v_fma_f32 v8, -v4, v5, 1.0 +; CI-NEXT: v_fma_f32 v5, v8, v5, v5 +; CI-NEXT: v_mul_f32_e32 v8, v7, v5 +; CI-NEXT: v_fma_f32 v9, -v4, v8, v7 +; CI-NEXT: v_fma_f32 v8, v9, v5, v8 +; CI-NEXT: v_fma_f32 v4, -v4, v8, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 +; CI-NEXT: v_div_fmas_f32 v4, v4, v5, v8 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 -; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 +; CI-NEXT: v_div_scale_f32 v3, vcc, v6, v6, v2 +; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v6, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_rcp_f32_e32 v4, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v7, v5, v5 -; CI-NEXT: v_mul_f32_e32 v7, v3, v5 -; CI-NEXT: v_fma_f32 v8, -v4, 
v7, v3 -; CI-NEXT: v_fma_f32 v7, v8, v5, v7 -; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 +; CI-NEXT: v_fma_f32 v7, -v3, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v7, v4, v4 +; CI-NEXT: v_mul_f32_e32 v7, v5, v4 +; CI-NEXT: v_fma_f32 v8, -v3, v7, v5 +; CI-NEXT: v_fma_f32 v7, v8, v4, v7 +; CI-NEXT: v_fma_f32 v3, -v3, v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 +; CI-NEXT: v_div_fmas_f32 v3, v3, v4, v7 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 @@ -2312,33 +2312,33 @@ ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v1 +; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; SI-NEXT: v_fma_f32 v6, v7, v6, v6 -; SI-NEXT: v_mul_f32_e32 v7, v4, v6 -; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; SI-NEXT: v_fma_f32 v7, v8, v6, v7 -; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; SI-NEXT: v_fma_f32 v5, v7, v5, v5 +; SI-NEXT: v_mul_f32_e32 v7, v6, v5 +; SI-NEXT: v_fma_f32 v8, -v4, v7, v6 +; SI-NEXT: v_fma_f32 v7, v8, v5, v7 +; SI-NEXT: v_fma_f32 v4, -v4, v7, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; SI-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 -; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; SI-NEXT: v_rcp_f32_e32 v5, v4 +; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v0 +; SI-NEXT: v_rcp_f32_e32 v4, v3 +; SI-NEXT: 
v_div_scale_f32 v5, vcc, v0, v2, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; SI-NEXT: v_fma_f32 v5, v6, v5, v5 -; SI-NEXT: v_mul_f32_e32 v6, v3, v5 -; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 -; SI-NEXT: v_fma_f32 v6, v7, v5, v6 -; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 +; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; SI-NEXT: v_fma_f32 v4, v6, v4, v4 +; SI-NEXT: v_mul_f32_e32 v6, v5, v4 +; SI-NEXT: v_fma_f32 v7, -v3, v6, v5 +; SI-NEXT: v_fma_f32 v6, v7, v4, v6 +; SI-NEXT: v_fma_f32 v3, -v3, v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 @@ -2363,33 +2363,33 @@ ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; CI-NEXT: v_rcp_f32_e32 v6, v5 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 +; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; CI-NEXT: v_fma_f32 v6, v7, v6, v6 -; CI-NEXT: v_mul_f32_e32 v7, v4, v6 -; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 -; CI-NEXT: v_fma_f32 v7, v8, v6, v7 -; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 +; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; CI-NEXT: v_fma_f32 v5, v7, v5, v5 +; CI-NEXT: v_mul_f32_e32 v7, v6, v5 +; CI-NEXT: v_fma_f32 v8, -v4, v7, v6 +; CI-NEXT: v_fma_f32 v7, v8, v5, v7 +; CI-NEXT: v_fma_f32 v4, -v4, v7, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; CI-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 
-; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; CI-NEXT: v_rcp_f32_e32 v5, v4 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v0 +; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; CI-NEXT: v_rcp_f32_e32 v4, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 -; CI-NEXT: v_fma_f32 v5, v6, v5, v5 -; CI-NEXT: v_mul_f32_e32 v6, v3, v5 -; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 -; CI-NEXT: v_fma_f32 v6, v7, v5, v6 -; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 +; CI-NEXT: v_fma_f32 v6, -v3, v4, 1.0 +; CI-NEXT: v_fma_f32 v4, v6, v4, v4 +; CI-NEXT: v_mul_f32_e32 v6, v5, v4 +; CI-NEXT: v_fma_f32 v7, -v3, v6, v5 +; CI-NEXT: v_fma_f32 v6, v7, v4, v6 +; CI-NEXT: v_fma_f32 v3, -v3, v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 +; CI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 @@ -2412,33 +2412,33 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 -; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 -; VI-NEXT: v_rcp_f32_e32 v8, v7 +; VI-NEXT: v_div_scale_f32 v6, vcc, v5, v5, v3 +; VI-NEXT: v_div_scale_f32 v8, vcc, v3, v5, v3 +; VI-NEXT: v_rcp_f32_e32 v7, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; VI-NEXT: v_fma_f32 v8, v9, v8, v8 -; VI-NEXT: v_mul_f32_e32 v9, v6, v8 -; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; VI-NEXT: v_fma_f32 v9, v10, v8, v9 -; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; VI-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; VI-NEXT: v_fma_f32 v7, v9, v7, v7 +; VI-NEXT: v_mul_f32_e32 v9, v8, v7 +; VI-NEXT: v_fma_f32 v10, -v6, v9, v8 +; VI-NEXT: v_fma_f32 v9, v10, v7, v9 +; VI-NEXT: v_fma_f32 v6, -v6, v9, v8 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: 
v_div_fmas_f32 v6, v6, v8, v9 +; VI-NEXT: v_div_fmas_f32 v6, v6, v7, v9 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 ; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 -; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 -; VI-NEXT: v_rcp_f32_e32 v7, v6 +; VI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v2 +; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v4, v2 +; VI-NEXT: v_rcp_f32_e32 v6, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; VI-NEXT: v_fma_f32 v7, v8, v7, v7 -; VI-NEXT: v_mul_f32_e32 v8, v5, v7 -; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; VI-NEXT: v_fma_f32 v8, v9, v7, v8 -; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; VI-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; VI-NEXT: v_fma_f32 v6, v8, v6, v6 +; VI-NEXT: v_mul_f32_e32 v8, v7, v6 +; VI-NEXT: v_fma_f32 v9, -v5, v8, v7 +; VI-NEXT: v_fma_f32 v8, v9, v6, v8 +; VI-NEXT: v_fma_f32 v5, -v5, v8, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; VI-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 ; VI-NEXT: v_trunc_f32_e32 v5, v5 ; VI-NEXT: v_fma_f32 v2, -v5, v4, v2 @@ -2454,33 +2454,33 @@ ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 -; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX9-NEXT: v_div_scale_f32 v7, vcc, v1, v3, v1 +; GFX9-NEXT: v_rcp_f32_e32 v6, v5 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 -; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 -; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX9-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; 
GFX9-NEXT: v_fma_f32 v6, v8, v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX9-NEXT: v_fma_f32 v9, -v5, v8, v7 +; GFX9-NEXT: v_fma_f32 v8, v9, v6, v8 +; GFX9-NEXT: v_fma_f32 v5, -v5, v8, v7 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX9-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; GFX9-NEXT: v_rcp_f32_e32 v6, v5 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v0 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v3 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX9-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX9-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX9-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX9-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX9-NEXT: v_fma_f32 v3, -v3, v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX9-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 ; GFX9-NEXT: v_fma_f32 v0, -v3, v2, v0 @@ -2498,33 +2498,33 @@ ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v1, v3, v1 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; 
GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v7 -; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX10-NEXT: v_fma_f32 v9, -v6, v8, v5 -; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX10-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX10-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v6 +; GFX10-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX10-NEXT: v_fma_f32 v9, -v5, v8, v7 +; GFX10-NEXT: v_fmac_f32_e32 v8, v9, v6 +; GFX10-NEXT: v_fma_f32 v5, -v5, v8, v7 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 -; GFX10-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v2, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-NEXT: v_rcp_f32_e32 v5, v3 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v6 -; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX10-NEXT: v_fma_f32 v8, -v5, v7, v3 -; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX10-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX10-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v5, v7, v5 +; GFX10-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX10-NEXT: v_fma_f32 v8, -v3, v7, v6 +; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX10-NEXT: v_fma_f32 v3, -v3, v7, v6 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX10-NEXT: v_trunc_f32_e32 v3, v3 ; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0 @@ -2542,44 +2542,44 @@ ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: 
v_div_scale_f32 v6, null, v3, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v3, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v1, v3, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: v_rcp_f32_e32 v6, v5 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v7 +; GFX11-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v8, v5, v7 -; GFX11-NEXT: v_fma_f32 v9, -v6, v8, v5 +; GFX11-NEXT: v_mul_f32_e32 v8, v7, v6 +; GFX11-NEXT: v_fma_f32 v9, -v5, v8, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v7 -; GFX11-NEXT: v_fma_f32 v5, -v6, v8, v5 +; GFX11-NEXT: v_fmac_f32_e32 v8, v9, v6 +; GFX11-NEXT: v_fma_f32 v5, -v5, v8, v7 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; GFX11-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v5, v5 ; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v2, v2, v0 +; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-NEXT: v_rcp_f32_e32 v5, v3 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v7, -v5, v6, 1.0 -; GFX11-NEXT: 
v_fmac_f32_e32 v6, v7, v6 +; GFX11-NEXT: v_fma_f32 v7, -v3, v5, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v7, v3, v6 -; GFX11-NEXT: v_fma_f32 v8, -v5, v7, v3 +; GFX11-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX11-NEXT: v_fma_f32 v8, -v3, v7, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v6 -; GFX11-NEXT: v_fma_f32 v3, -v5, v7, v3 +; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v5 +; GFX11-NEXT: v_fma_f32 v3, -v3, v7, v6 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v3, v3, v6, v7 +; GFX11-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; GFX11-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v3, v3 @@ -2615,63 +2615,63 @@ ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 -; SI-NEXT: v_rcp_f32_e32 v10, v9 +; SI-NEXT: v_div_scale_f32 v8, vcc, v7, v7, v3 +; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; SI-NEXT: v_fma_f32 v10, v11, v10, v10 -; SI-NEXT: v_mul_f32_e32 v11, v8, v10 -; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; SI-NEXT: v_fma_f32 v11, v12, v10, v11 -; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; SI-NEXT: v_fma_f32 v11, -v8, v9, 1.0 +; SI-NEXT: v_fma_f32 v9, v11, v9, v9 +; SI-NEXT: v_mul_f32_e32 v11, v10, v9 +; SI-NEXT: v_fma_f32 v12, -v8, v11, v10 +; SI-NEXT: v_fma_f32 v11, v12, v9, v11 +; SI-NEXT: v_fma_f32 v8, -v8, v11, v10 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 
4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; SI-NEXT: v_div_fmas_f32 v8, v8, v9, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 ; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 -; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 -; SI-NEXT: v_rcp_f32_e32 v9, v8 +; SI-NEXT: v_div_scale_f32 v7, vcc, v6, v6, v2 +; SI-NEXT: v_rcp_f32_e32 v8, v7 +; SI-NEXT: v_div_scale_f32 v9, vcc, v2, v6, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; SI-NEXT: v_fma_f32 v9, v10, v9, v9 -; SI-NEXT: v_mul_f32_e32 v10, v7, v9 -; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 -; SI-NEXT: v_fma_f32 v10, v11, v9, v10 -; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 +; SI-NEXT: v_fma_f32 v10, -v7, v8, 1.0 +; SI-NEXT: v_fma_f32 v8, v10, v8, v8 +; SI-NEXT: v_mul_f32_e32 v10, v9, v8 +; SI-NEXT: v_fma_f32 v11, -v7, v10, v9 +; SI-NEXT: v_fma_f32 v10, v11, v8, v10 +; SI-NEXT: v_fma_f32 v7, -v7, v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; SI-NEXT: v_div_fmas_f32 v7, v7, v8, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 ; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 -; SI-NEXT: v_rcp_f32_e32 v8, v7 +; SI-NEXT: v_div_scale_f32 v6, vcc, v5, v5, v1 +; SI-NEXT: v_rcp_f32_e32 v7, v6 +; SI-NEXT: v_div_scale_f32 v8, vcc, v1, v5, v1 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; SI-NEXT: v_fma_f32 v8, v9, v8, v8 -; SI-NEXT: v_mul_f32_e32 v9, v6, v8 -; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; SI-NEXT: v_fma_f32 v9, v10, v8, v9 -; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; SI-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; SI-NEXT: v_fma_f32 v7, v9, v7, v7 +; SI-NEXT: v_mul_f32_e32 v9, v8, v7 +; SI-NEXT: v_fma_f32 v10, -v6, v9, v8 +; SI-NEXT: v_fma_f32 v9, v10, v7, v9 +; 
SI-NEXT: v_fma_f32 v6, -v6, v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 +; SI-NEXT: v_div_fmas_f32 v6, v6, v7, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 ; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 -; SI-NEXT: v_rcp_f32_e32 v7, v6 +; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v0 +; SI-NEXT: v_rcp_f32_e32 v6, v5 +; SI-NEXT: v_div_scale_f32 v7, vcc, v0, v4, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; SI-NEXT: v_fma_f32 v7, v8, v7, v7 -; SI-NEXT: v_mul_f32_e32 v8, v5, v7 -; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; SI-NEXT: v_fma_f32 v8, v9, v7, v8 -; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; SI-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; SI-NEXT: v_fma_f32 v6, v8, v6, v6 +; SI-NEXT: v_mul_f32_e32 v8, v7, v6 +; SI-NEXT: v_fma_f32 v9, -v5, v8, v7 +; SI-NEXT: v_fma_f32 v8, v9, v6, v8 +; SI-NEXT: v_fma_f32 v5, -v5, v8, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; SI-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 ; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 @@ -2696,63 +2696,63 @@ ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 -; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 -; CI-NEXT: v_rcp_f32_e32 v10, v9 +; CI-NEXT: v_div_scale_f32 v8, vcc, v7, v7, v3 +; CI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 +; CI-NEXT: v_rcp_f32_e32 v9, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; CI-NEXT: v_fma_f32 v10, v11, v10, v10 -; CI-NEXT: v_mul_f32_e32 v11, v8, v10 -; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 -; CI-NEXT: v_fma_f32 v11, 
v12, v10, v11 -; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 +; CI-NEXT: v_fma_f32 v11, -v8, v9, 1.0 +; CI-NEXT: v_fma_f32 v9, v11, v9, v9 +; CI-NEXT: v_mul_f32_e32 v11, v10, v9 +; CI-NEXT: v_fma_f32 v12, -v8, v11, v10 +; CI-NEXT: v_fma_f32 v11, v12, v9, v11 +; CI-NEXT: v_fma_f32 v8, -v8, v11, v10 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 +; CI-NEXT: v_div_fmas_f32 v8, v8, v9, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 ; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 -; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; CI-NEXT: v_rcp_f32_e32 v9, v8 +; CI-NEXT: v_div_scale_f32 v7, vcc, v6, v6, v2 +; CI-NEXT: v_div_scale_f32 v9, vcc, v2, v6, v2 +; CI-NEXT: v_rcp_f32_e32 v8, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 -; CI-NEXT: v_fma_f32 v9, v10, v9, v9 -; CI-NEXT: v_mul_f32_e32 v10, v7, v9 -; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 -; CI-NEXT: v_fma_f32 v10, v11, v9, v10 -; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 +; CI-NEXT: v_fma_f32 v10, -v7, v8, 1.0 +; CI-NEXT: v_fma_f32 v8, v10, v8, v8 +; CI-NEXT: v_mul_f32_e32 v10, v9, v8 +; CI-NEXT: v_fma_f32 v11, -v7, v10, v9 +; CI-NEXT: v_fma_f32 v10, v11, v8, v10 +; CI-NEXT: v_fma_f32 v7, -v7, v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 +; CI-NEXT: v_div_fmas_f32 v7, v7, v8, v10 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 ; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 -; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; CI-NEXT: v_rcp_f32_e32 v8, v7 +; CI-NEXT: v_div_scale_f32 v6, vcc, v5, v5, v1 +; CI-NEXT: v_div_scale_f32 v8, vcc, v1, v5, v1 +; CI-NEXT: v_rcp_f32_e32 v7, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 -; CI-NEXT: v_fma_f32 v8, v9, v8, v8 -; 
CI-NEXT: v_mul_f32_e32 v9, v6, v8 -; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 -; CI-NEXT: v_fma_f32 v9, v10, v8, v9 -; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 +; CI-NEXT: v_fma_f32 v9, -v6, v7, 1.0 +; CI-NEXT: v_fma_f32 v7, v9, v7, v7 +; CI-NEXT: v_mul_f32_e32 v9, v8, v7 +; CI-NEXT: v_fma_f32 v10, -v6, v9, v8 +; CI-NEXT: v_fma_f32 v9, v10, v7, v9 +; CI-NEXT: v_fma_f32 v6, -v6, v9, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 +; CI-NEXT: v_div_fmas_f32 v6, v6, v7, v9 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 ; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 -; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; CI-NEXT: v_rcp_f32_e32 v7, v6 +; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v0 +; CI-NEXT: v_div_scale_f32 v7, vcc, v0, v4, v0 +; CI-NEXT: v_rcp_f32_e32 v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 -; CI-NEXT: v_fma_f32 v7, v8, v7, v7 -; CI-NEXT: v_mul_f32_e32 v8, v5, v7 -; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 -; CI-NEXT: v_fma_f32 v8, v9, v7, v8 -; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 +; CI-NEXT: v_fma_f32 v8, -v5, v6, 1.0 +; CI-NEXT: v_fma_f32 v6, v8, v6, v6 +; CI-NEXT: v_mul_f32_e32 v8, v7, v6 +; CI-NEXT: v_fma_f32 v9, -v5, v8, v7 +; CI-NEXT: v_fma_f32 v8, v9, v6, v8 +; CI-NEXT: v_fma_f32 v5, -v5, v8, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 +; CI-NEXT: v_div_fmas_f32 v5, v5, v6, v8 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 @@ -2775,63 +2775,63 @@ ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 -; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 -; VI-NEXT: v_rcp_f32_e32 v12, v11 +; VI-NEXT: v_div_scale_f32 v10, vcc, v7, v7, v3 +; VI-NEXT: v_div_scale_f32 v12, vcc, 
v3, v7, v3 +; VI-NEXT: v_rcp_f32_e32 v11, v10 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 -; VI-NEXT: v_fma_f32 v12, v13, v12, v12 -; VI-NEXT: v_mul_f32_e32 v13, v10, v12 -; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 -; VI-NEXT: v_fma_f32 v13, v14, v12, v13 -; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 +; VI-NEXT: v_fma_f32 v13, -v10, v11, 1.0 +; VI-NEXT: v_fma_f32 v11, v13, v11, v11 +; VI-NEXT: v_mul_f32_e32 v13, v12, v11 +; VI-NEXT: v_fma_f32 v14, -v10, v13, v12 +; VI-NEXT: v_fma_f32 v13, v14, v11, v13 +; VI-NEXT: v_fma_f32 v10, -v10, v13, v12 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 +; VI-NEXT: v_div_fmas_f32 v10, v10, v11, v13 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 ; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 -; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 -; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; VI-NEXT: v_rcp_f32_e32 v11, v10 +; VI-NEXT: v_div_scale_f32 v7, vcc, v6, v6, v2 +; VI-NEXT: v_div_scale_f32 v11, vcc, v2, v6, v2 +; VI-NEXT: v_rcp_f32_e32 v10, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; VI-NEXT: v_fma_f32 v11, v12, v11, v11 -; VI-NEXT: v_mul_f32_e32 v12, v7, v11 -; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 -; VI-NEXT: v_fma_f32 v12, v13, v11, v12 -; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 +; VI-NEXT: v_fma_f32 v12, -v7, v10, 1.0 +; VI-NEXT: v_fma_f32 v10, v12, v10, v10 +; VI-NEXT: v_mul_f32_e32 v12, v11, v10 +; VI-NEXT: v_fma_f32 v13, -v7, v12, v11 +; VI-NEXT: v_fma_f32 v12, v13, v10, v12 +; VI-NEXT: v_fma_f32 v7, -v7, v12, v11 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 +; VI-NEXT: v_div_fmas_f32 v7, v7, v10, v12 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 ; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 -; 
VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; VI-NEXT: v_rcp_f32_e32 v10, v7 +; VI-NEXT: v_div_scale_f32 v6, vcc, v5, v5, v1 +; VI-NEXT: v_div_scale_f32 v10, vcc, v1, v5, v1 +; VI-NEXT: v_rcp_f32_e32 v7, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 -; VI-NEXT: v_fma_f32 v10, v11, v10, v10 -; VI-NEXT: v_mul_f32_e32 v11, v6, v10 -; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 -; VI-NEXT: v_fma_f32 v11, v12, v10, v11 -; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 +; VI-NEXT: v_fma_f32 v11, -v6, v7, 1.0 +; VI-NEXT: v_fma_f32 v7, v11, v7, v7 +; VI-NEXT: v_mul_f32_e32 v11, v10, v7 +; VI-NEXT: v_fma_f32 v12, -v6, v11, v10 +; VI-NEXT: v_fma_f32 v11, v12, v7, v11 +; VI-NEXT: v_fma_f32 v6, -v6, v11, v10 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 +; VI-NEXT: v_div_fmas_f32 v6, v6, v7, v11 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 ; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 -; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; VI-NEXT: v_rcp_f32_e32 v7, v6 +; VI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v0 +; VI-NEXT: v_div_scale_f32 v7, vcc, v0, v4, v0 +; VI-NEXT: v_rcp_f32_e32 v6, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 -; VI-NEXT: v_fma_f32 v7, v10, v7, v7 -; VI-NEXT: v_mul_f32_e32 v10, v5, v7 -; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 -; VI-NEXT: v_fma_f32 v10, v11, v7, v10 -; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 +; VI-NEXT: v_fma_f32 v10, -v5, v6, 1.0 +; VI-NEXT: v_fma_f32 v6, v10, v6, v6 +; VI-NEXT: v_mul_f32_e32 v10, v7, v6 +; VI-NEXT: v_fma_f32 v11, -v5, v10, v7 +; VI-NEXT: v_fma_f32 v10, v11, v6, v10 +; VI-NEXT: v_fma_f32 v5, -v5, v10, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 +; VI-NEXT: v_div_fmas_f32 v5, v5, v6, v10 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; VI-NEXT: 
v_trunc_f32_e32 v5, v5 ; VI-NEXT: v_fma_f32 v0, -v5, v4, v0 @@ -2847,63 +2847,63 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 -; GFX9-NEXT: v_rcp_f32_e32 v11, v10 +; GFX9-NEXT: v_div_scale_f32 v9, vcc, v7, v7, v3 +; GFX9-NEXT: v_div_scale_f32 v11, vcc, v3, v7, v3 +; GFX9-NEXT: v_rcp_f32_e32 v10, v9 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 -; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 -; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX9-NEXT: v_fma_f32 v12, -v9, v10, 1.0 +; GFX9-NEXT: v_fma_f32 v10, v12, v10, v10 +; GFX9-NEXT: v_mul_f32_e32 v12, v11, v10 +; GFX9-NEXT: v_fma_f32 v13, -v9, v12, v11 +; GFX9-NEXT: v_fma_f32 v12, v13, v10, v12 +; GFX9-NEXT: v_fma_f32 v9, -v9, v12, v11 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX9-NEXT: v_div_fmas_f32 v9, v9, v10, v12 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; GFX9-NEXT: v_rcp_f32_e32 v10, v9 +; GFX9-NEXT: v_div_scale_f32 v7, vcc, v6, v6, v2 +; GFX9-NEXT: v_div_scale_f32 v10, vcc, v2, v6, v2 +; GFX9-NEXT: v_rcp_f32_e32 v9, v7 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 -; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 -; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX9-NEXT: v_fma_f32 v11, -v7, v9, 1.0 +; GFX9-NEXT: 
v_fma_f32 v9, v11, v9, v9 +; GFX9-NEXT: v_mul_f32_e32 v11, v10, v9 +; GFX9-NEXT: v_fma_f32 v12, -v7, v11, v10 +; GFX9-NEXT: v_fma_f32 v11, v12, v9, v11 +; GFX9-NEXT: v_fma_f32 v7, -v7, v11, v10 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX9-NEXT: v_div_fmas_f32 v7, v7, v9, v11 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; GFX9-NEXT: v_rcp_f32_e32 v9, v7 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, v5, v5, v1 +; GFX9-NEXT: v_div_scale_f32 v9, vcc, v1, v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 -; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 -; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 -; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX9-NEXT: v_fma_f32 v10, -v6, v7, 1.0 +; GFX9-NEXT: v_fma_f32 v7, v10, v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v10, v9, v7 +; GFX9-NEXT: v_fma_f32 v11, -v6, v10, v9 +; GFX9-NEXT: v_fma_f32 v10, v11, v7, v10 +; GFX9-NEXT: v_fma_f32 v6, -v6, v10, v9 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX9-NEXT: v_div_fmas_f32 v6, v6, v7, v10 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 -; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; GFX9-NEXT: v_rcp_f32_e32 v7, v6 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v0 +; GFX9-NEXT: v_div_scale_f32 v7, vcc, v0, v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v6, v5 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX9-NEXT: 
v_mul_f32_e32 v9, v5, v7 -; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 -; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX9-NEXT: v_fma_f32 v9, -v5, v6, 1.0 +; GFX9-NEXT: v_fma_f32 v6, v9, v6, v6 +; GFX9-NEXT: v_mul_f32_e32 v9, v7, v6 +; GFX9-NEXT: v_fma_f32 v10, -v5, v9, v7 +; GFX9-NEXT: v_fma_f32 v9, v10, v6, v9 +; GFX9-NEXT: v_fma_f32 v5, -v5, v9, v7 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX9-NEXT: v_div_fmas_f32 v5, v5, v6, v9 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_fma_f32 v0, -v5, v4, v0 @@ -2921,63 +2921,63 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 -; GFX10-NEXT: v_rcp_f32_e32 v11, v10 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v7, v7, v3 +; GFX10-NEXT: v_div_scale_f32 v11, vcc_lo, v3, v7, v3 +; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v11 -; GFX10-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX10-NEXT: v_fma_f32 v13, -v10, v12, v9 -; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v11 -; GFX10-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX10-NEXT: v_fma_f32 v12, -v9, v10, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v10, v12, v10 +; GFX10-NEXT: v_mul_f32_e32 v12, v11, v10 +; GFX10-NEXT: v_fma_f32 v13, -v9, v12, v11 +; GFX10-NEXT: v_fmac_f32_e32 v12, v13, v10 +; GFX10-NEXT: v_fma_f32 v9, -v9, v12, v11 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX10-NEXT: v_div_fmas_f32 v9, v9, v10, v12 ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 ; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 -; GFX10-NEXT: v_div_scale_f32 
v7, vcc_lo, v2, v6, v2 -; GFX10-NEXT: v_rcp_f32_e32 v10, v9 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v6, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v10, vcc_lo, v2, v6, v2 +; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v10 -; GFX10-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX10-NEXT: v_fma_f32 v12, -v9, v11, v7 -; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v10 -; GFX10-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX10-NEXT: v_fma_f32 v11, -v7, v9, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v9, v11, v9 +; GFX10-NEXT: v_mul_f32_e32 v11, v10, v9 +; GFX10-NEXT: v_fma_f32 v12, -v7, v11, v10 +; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9 +; GFX10-NEXT: v_fma_f32 v7, -v7, v11, v10 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX10-NEXT: v_div_fmas_f32 v7, v7, v9, v11 ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 ; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 -; GFX10-NEXT: v_rcp_f32_e32 v9, v7 +; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v5, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v1, v5, v1 +; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v9 -; GFX10-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX10-NEXT: v_fma_f32 v11, -v7, v10, v6 -; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v9 -; GFX10-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX10-NEXT: v_fma_f32 v10, -v6, v7, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v7, v10, v7 +; GFX10-NEXT: v_mul_f32_e32 v10, v9, v7 +; GFX10-NEXT: v_fma_f32 v11, -v6, v10, v9 +; GFX10-NEXT: v_fmac_f32_e32 v10, v11, v7 +; GFX10-NEXT: v_fma_f32 v6, -v6, v10, v9 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX10-NEXT: v_div_fmas_f32 v6, v6, v7, v10 ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; 
GFX10-NEXT: v_trunc_f32_e32 v6, v6 ; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 -; GFX10-NEXT: v_rcp_f32_e32 v7, v6 +; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v4, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v4, v0 +; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 -; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX10-NEXT: v_fma_f32 v10, -v6, v9, v5 -; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v7 -; GFX10-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX10-NEXT: v_fma_f32 v9, -v5, v6, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v6, v9, v6 +; GFX10-NEXT: v_mul_f32_e32 v9, v7, v6 +; GFX10-NEXT: v_fma_f32 v10, -v5, v9, v7 +; GFX10-NEXT: v_fmac_f32_e32 v9, v10, v6 +; GFX10-NEXT: v_fma_f32 v5, -v5, v9, v7 ; GFX10-NEXT: s_denorm_mode 12 -; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX10-NEXT: v_div_fmas_f32 v5, v5, v6, v9 ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0 @@ -2995,86 +2995,86 @@ ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v7, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v11, vcc_lo, v3, v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v11, v10 +; GFX11-NEXT: v_rcp_f32_e32 v10, v9 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v12, -v10, v11, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v11 +; GFX11-NEXT: v_fma_f32 v12, -v9, v10, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v10, v12, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v12, v9, v11 -; GFX11-NEXT: v_fma_f32 v13, -v10, v12, v9 +; GFX11-NEXT: v_mul_f32_e32 v12, v11, v10 +; GFX11-NEXT: v_fma_f32 v13, -v9, v12, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v11 -; GFX11-NEXT: v_fma_f32 v9, -v10, v12, v9 +; GFX11-NEXT: v_fmac_f32_e32 v12, v13, v10 +; GFX11-NEXT: v_fma_f32 v9, -v9, v12, v11 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v9, v9, v11, v12 +; GFX11-NEXT: v_div_fmas_f32 v9, v9, v10, v12 ; GFX11-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v9, v9 ; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v6, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v10, vcc_lo, v2, v6, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v10, v9 +; GFX11-NEXT: v_rcp_f32_e32 v9, v7 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v11, -v9, v10, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v10 +; GFX11-NEXT: v_fma_f32 v11, -v7, v9, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v9, v11, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v11, v7, v10 -; GFX11-NEXT: v_fma_f32 v12, -v9, v11, v7 +; GFX11-NEXT: v_mul_f32_e32 v11, v10, v9 +; GFX11-NEXT: v_fma_f32 v12, -v7, v11, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v10 -; GFX11-NEXT: v_fma_f32 v7, -v9, v11, v7 +; GFX11-NEXT: v_fmac_f32_e32 v11, v12, v9 +; GFX11-NEXT: v_fma_f32 v7, -v7, v11, v10 ; 
GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v7, v7, v10, v11 +; GFX11-NEXT: v_div_fmas_f32 v7, v7, v9, v11 ; GFX11-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v7, v7 ; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v5, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v1, v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v9, v7 +; GFX11-NEXT: v_rcp_f32_e32 v7, v6 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v10, -v7, v9, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v9 +; GFX11-NEXT: v_fma_f32 v10, -v6, v7, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v10, v6, v9 -; GFX11-NEXT: v_fma_f32 v11, -v7, v10, v6 +; GFX11-NEXT: v_mul_f32_e32 v10, v9, v7 +; GFX11-NEXT: v_fma_f32 v11, -v6, v10, v9 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v9 -; GFX11-NEXT: v_fma_f32 v6, -v7, v10, v6 +; GFX11-NEXT: v_fmac_f32_e32 v10, v11, v7 +; GFX11-NEXT: v_fma_f32 v6, -v6, v10, v9 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v6, v6, v9, v10 +; GFX11-NEXT: v_div_fmas_f32 v6, v6, v7, v10 ; GFX11-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v6, v6 ; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 
-; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v4, v4, v0 +; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v0, v4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_f32_e32 v7, v6 +; GFX11-NEXT: v_rcp_f32_e32 v6, v5 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v9, -v6, v7, 1.0 -; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v7 +; GFX11-NEXT: v_fma_f32 v9, -v5, v6, 1.0 +; GFX11-NEXT: v_fmac_f32_e32 v6, v9, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX11-NEXT: v_fma_f32 v10, -v6, v9, v5 +; GFX11-NEXT: v_mul_f32_e32 v9, v7, v6 +; GFX11-NEXT: v_fma_f32 v10, -v5, v9, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v7 -; GFX11-NEXT: v_fma_f32 v5, -v6, v9, v5 +; GFX11-NEXT: v_fmac_f32_e32 v9, v10, v6 +; GFX11-NEXT: v_fma_f32 v5, -v5, v9, v7 ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fmas_f32 v5, v5, v7, v9 +; GFX11-NEXT: v_div_fmas_f32 v5, v5, v6, v9 ; GFX11-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v5, v5 @@ -3110,13 +3110,13 @@ ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], 
v[10:11] -; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 @@ -3141,13 +3141,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1] ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 @@ -3192,7 +3192,7 @@ ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -3206,7 +3206,7 @@ ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; CI-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] +; CI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; CI-NEXT: 
v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -3239,7 +3239,7 @@ ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] @@ -3253,7 +3253,7 @@ ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] ; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; VI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -3279,7 +3279,7 @@ ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -3293,7 +3293,7 @@ ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -3321,7 +3321,7 @@ ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] 
offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[6:7], v[6:7], v[2:3] ; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] @@ -3334,7 +3334,7 @@ ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[4:5], v[4:5], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] @@ -3361,7 +3361,7 @@ ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[6:7], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -3380,7 +3380,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc_lo, v[4:5], v[4:5], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX11-NEXT: s_waitcnt_depctr 0xfff diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ 
b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -55,7 +55,7 @@ S_BRANCH %bb.3 bb.3: - $vgpr4, $vcc = V_DIV_SCALE_F32_e64 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_DIV_SCALE_F32_e64 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit-def $vcc, implicit $exec $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_1: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -28,7 +28,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_2: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -48,7 +48,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_1: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { @@ -68,7 +68,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_2: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { @@ -88,7 +88,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1: ; SI-DAG: buffer_load_dword [[B:v[0-9]+]] ; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { @@ -106,7 +106,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2: ; SI-DAG: buffer_load_dword [[B:v[0-9]+]] ; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, 
[[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { @@ -124,7 +124,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]] ; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { @@ -142,7 +142,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]] ; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { @@ -160,7 +160,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1: ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { @@ -178,7 +178,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_2: ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: 
v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { @@ -196,7 +196,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { @@ -214,7 +214,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { @@ -233,7 +233,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void 
@test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { @@ -247,7 +247,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { @@ -262,7 +262,7 @@ ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -277,7 +277,7 @@ ; SI-DAG: s_load_dwordx2 s[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -289,7 +289,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_num: ; 
SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[A]], 1.0 ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -305,7 +305,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_den: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, 2.0, 2.0, [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -322,7 +322,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_fneg_num: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], -[[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], -[[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -345,7 +345,7 @@ ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, 
[[B]], [[B]], [[ABS_A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -367,7 +367,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_fneg_den: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], -[[B]], -[[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, -[[B]], -[[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -390,7 +390,7 @@ ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_and_b32_e32 [[ABS_B:v[0-9]+]], 0x7fffffff, [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[ABS_B]], [[ABS_B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -411,7 +411,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], v{{[0-9]+}}, [[K]] +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}, [[K]] define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -421,7 +421,7 @@ ; SI-LABEL: 
{{^}}test_div_scale_f32_undef_val_val: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], v{{[0-9]+}} +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, [[K]], [[K]], v{{[0-9]+}} define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -431,7 +431,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val: ; SI-NOT: v0 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0 +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, s0, s0, v0 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -442,7 +442,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val: ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000 -; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]] +; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, vcc, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]] define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll @@ -55,7 +55,7 @@ ; GFX7-LABEL: v_powi_neg1_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 ; GFX7-NEXT: v_div_scale_f32 
v3, vcc, 1.0, v0, 1.0 ; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -71,7 +71,7 @@ ; GFX8-LABEL: v_powi_neg1_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -102,7 +102,7 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -119,7 +119,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -198,7 +198,7 @@ ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -221,7 +221,7 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir --- 
a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -172,7 +172,7 @@ --- # CHECK: name: sched_dbg_value_crash -# CHECK: DBG_VALUE %99, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 +# CHECK: DBG_VALUE %97, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 name: sched_dbg_value_crash alignment: 1 @@ -291,9 +291,9 @@ %87:vgpr_32 = IMPLICIT_DEF %88:vgpr_32 = IMPLICIT_DEF %90:vgpr_32 = IMPLICIT_DEF - %91:vgpr_32, dead %92:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec + %91:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %95:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit $mode, implicit $exec - %96:vgpr_32, %97:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, 1065353216, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec + %96:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, 1065353216, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %98:vgpr_32 = IMPLICIT_DEF %99:vgpr_32 = IMPLICIT_DEF %100:vgpr_32 = IMPLICIT_DEF @@ -302,11 +302,11 @@ %103:vgpr_32 = IMPLICIT_DEF %104:vgpr_32 = IMPLICIT_DEF %105:vgpr_32 = IMPLICIT_DEF - %106:vgpr_32, dead %107:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec + %106:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %108:vgpr_32 = nofpexcept V_RCP_F32_e32 0, implicit $mode, implicit $exec %109:vgpr_32 = IMPLICIT_DEF %110:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %111:vgpr_32, %112:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %111:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 
0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, %110, implicit $mode, implicit $exec %114:vgpr_32 = IMPLICIT_DEF %115:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -351,8 +351,8 @@ } ; GCN-LABEL: {{^}}test_div_scale_f32: -; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -368,8 +368,8 @@ } ; GCN-LABEL: {{^}}test_div_scale_f64: -; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] -; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], vcc_lo, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], vcc, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -469,8 +469,8 @@ } ; GCN-LABEL: {{^}}fdiv_f32: -; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; 
GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -6793,333 +6793,333 @@ v_mul_hi_i32 v5, v1, -4.0 // GFX10: encoding: [0x05,0x00,0x6c,0xd5,0x01,0xef,0x01,0x00] -v_div_scale_f32 v5, s0, v1, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v255, s0, v1, v2, v3 -// W32: encoding: [0xff,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v255, vcc_lo, v1, v2, v3 +// W32: encoding: [0xff,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v255, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0xff,0x05,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v255, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0xff,0x05,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, s1, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, s1, v2, v3 +// W32: 
encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, s103, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x67,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, s103, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x67,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, vcc_lo, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x6a,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, vcc_lo, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x6a,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, vcc_hi, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x6b,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, vcc_hi, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x6b,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, ttmp11, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x77,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, ttmp11, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x77,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, m0, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x7c,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, m0, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x7c,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode 
-v_div_scale_f32 v5, s0, exec_lo, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x7e,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, exec_lo, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x7e,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, exec_hi, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x7f,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, exec_hi, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x7f,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, 0, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x80,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, 0, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x80,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, -1, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0xc1,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, -1, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0xc1,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, 0.5, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0xf0,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, 0.5, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0xf0,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, -4.0, v2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0xf7,0x04,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand 
for instruction +v_div_scale_f32 v5, vcc_lo, -4.0, v2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0xf7,0x04,0x0e,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v255, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0f,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v255, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0f,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, s2, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, s2, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, s103, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xcf,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, s103, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xcf,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, vcc_lo, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd5,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, vcc_lo, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd5,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, vcc_hi, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd7,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, vcc_hi, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd7,0x0c,0x04] +// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, ttmp11, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, ttmp11, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, m0, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xf9,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, m0, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xf9,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, exec_lo, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xfd,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, exec_lo, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xfd,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, exec_hi, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, exec_hi, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0c,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, 0, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x01,0x0d,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, 0, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x01,0x0d,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, -1, v3 -// W32: encoding: 
[0x05,0x00,0x6d,0xd5,0x01,0x83,0x0d,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, -1, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x83,0x0d,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, 0.5, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xe1,0x0d,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, 0.5, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xe1,0x0d,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, -4.0, v3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0d,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, -4.0, v3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0d,0x04] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, v255 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x07] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, v255 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x07] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, s3 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, s3 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x00] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, s103 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x9e,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, s103 +// 
W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x9e,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, vcc_lo -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xaa,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, vcc_lo +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xaa,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, vcc_hi -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xae,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, vcc_hi +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xae,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, ttmp11 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, ttmp11 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, m0 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xf2,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, m0 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xf2,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, exec_lo -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfa,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, exec_lo +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfa,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode 
-v_div_scale_f32 v5, s0, v1, v2, exec_hi -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, exec_hi +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x01] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, 0 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x02,0x02] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, 0 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x02,0x02] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, -1 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x06,0x03] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, -1 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x06,0x03] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, 0.5 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xc2,0x03] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, 0.5 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xc2,0x03] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s0, v1, v2, -4.0 -// W32: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x03] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc_lo, v1, v2, -4.0 +// W32: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x03] +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for 
instruction +v_div_scale_f32 v5, vcc, v1, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v255, s[0:1], v1, v2, v3 -// W64: encoding: [0xff,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v255, vcc, v1, v2, v3 +// W64: encoding: [0xff,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v255, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0xff,0x05,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v255, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0xff,0x05,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], s1, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, s1, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], s103, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x67,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, s103, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x67,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], vcc_lo, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x6a,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, vcc_lo, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x6a,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], vcc_hi, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x6b,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, vcc_hi, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x6b,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], ttmp11, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x77,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, ttmp11, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x77,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], m0, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x7c,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, m0, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x7c,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], exec_lo, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x7e,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, exec_lo, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x7e,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], exec_hi, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x7f,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, exec_hi, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x7f,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], 0, v2, v3 -// W64: encoding: 
[0x05,0x00,0x6d,0xd5,0x80,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, 0, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x80,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], -1, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0xc1,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, -1, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0xc1,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], 0.5, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0xf0,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, 0.5, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0xf0,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], -4.0, v2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0xf7,0x04,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, -4.0, v2, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0xf7,0x04,0x0e,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v255, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0f,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v255, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0f,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, s2, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, s2, v3 +// W64: 
encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, s103, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xcf,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, s103, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xcf,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, vcc_lo, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd5,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, vcc_lo, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd5,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, vcc_hi, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd7,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, vcc_hi, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd7,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, ttmp11, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, ttmp11, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, m0, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xf9,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, m0, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xf9,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode 
-v_div_scale_f32 v5, s[0:1], v1, exec_lo, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xfd,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, exec_lo, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xfd,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, exec_hi, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, exec_hi, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0c,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, 0, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x01,0x0d,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, 0, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x01,0x0d,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, -1, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x83,0x0d,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, -1, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x83,0x0d,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, 0.5, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xe1,0x0d,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, 0.5, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xe1,0x0d,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, -4.0, v3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0d,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: 
invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, -4.0, v3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0d,0x04] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, v255 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x07] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, v255 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x07] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, s3 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x00] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, s3 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x00] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, s103 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x9e,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, s103 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x9e,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, vcc_lo -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xaa,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, vcc_lo +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xaa,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, vcc_hi -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xae,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, vcc_hi +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xae,0x01] +// W32-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, ttmp11 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, ttmp11 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, m0 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xf2,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, m0 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xf2,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, exec_lo -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfa,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, exec_lo +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfa,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, exec_hi -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, exec_hi +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x01] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, 0 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x02,0x02] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, 0 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x02,0x02] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, -1 -// W64: encoding: 
[0x05,0x00,0x6d,0xd5,0x01,0x05,0x06,0x03] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, -1 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x06,0x03] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, 0.5 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xc2,0x03] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, 0.5 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xc2,0x03] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode -v_div_scale_f32 v5, s[0:1], v1, v2, -4.0 -// W64: encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x03] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +v_div_scale_f32 v5, vcc, v1, v2, -4.0 +// W64: encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x03] +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_fmas_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x6f,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s @@ -4060,219 +4060,219 @@ v_div_scale_f32 v5, vcc_lo, v1, v2, s3 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, v255, s2, s105 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, s1, v255, exec_hi // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] -// W64-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, s105, s105, exec_lo // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, vcc_lo, ttmp15, v3 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, vcc_hi, 0xaf123456, v255 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, -ttmp15, -src_scc, -ttmp15 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, m0, 0.5, m0 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, exec_lo, -1, vcc_hi // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, -exec_hi, null, -vcc_lo // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for 
instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, null, exec_lo, neg(0xaf123456) // W32: encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, -1, -exec_hi, -src_scc // W32: encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, 0.5, -m0, 0.5 mul:2 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc_lo, -src_scc, vcc_lo, -1 mul:4 // W32: encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v255, vcc_lo, neg(0xaf123456), -vcc_hi, null clamp div:2 // W32: encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, v1, v2, s3 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x01,0x05,0x0e,0x00] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, v255, s2, s105 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0xff,0x05,0xa4,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// 
W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, s1, v255, exec_hi // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x01,0xfe,0xff,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, s105, s105, exec_lo // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x69,0xd2,0xf8,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, vcc_lo, ttmp15, v3 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x6a,0xf6,0x0c,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, vcc_hi, 0xaf123456, v255 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x6b,0xfe,0xfd,0x07,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, -ttmp15, -src_scc, -ttmp15 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x7b,0xfa,0xed,0xe1] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, m0, 0.5, m0 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x7d,0xe0,0xf5,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, exec_lo, -1, vcc_hi // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x7e,0x82,0xad,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or 
mode v_div_scale_f32 v5, vcc, -exec_hi, null, -vcc_lo // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x7f,0xf8,0xa8,0xa1] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, null, exec_lo, neg(0xaf123456) // W64: encoding: [0x05,0x6a,0xfc,0xd6,0x7c,0xfc,0xfc,0x83,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, -1, -exec_hi, -src_scc // W64: encoding: [0x05,0x6a,0xfc,0xd6,0xc1,0xfe,0xf4,0xc3] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, 0.5, -m0, 0.5 mul:2 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0xf0,0xfa,0xc0,0x4b] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v5, vcc, -src_scc, vcc_lo, -1 mul:4 // W64: encoding: [0x05,0x6a,0xfc,0xd6,0xfd,0xd4,0x04,0x33] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f32 v255, vcc, neg(0xaf123456), -vcc_hi, null clamp div:2 // W64: encoding: [0xff,0xea,0xfc,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, v[1:2], v[2:3], v[3:4] // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or 
mode v_div_scale_f64 v[5:6], vcc_lo, v[254:255], v[254:255], s[6:7] // W32: encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, s[2:3], s[4:5], v[254:255] // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, -s[104:105], s[104:105], -s[104:105] // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, vcc, -ttmp[14:15], -ttmp[14:15] // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, -ttmp[14:15], 0xaf123456, null // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, -exec, -src_scc, -exec // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, null, 0.5, vcc // W32: encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not 
valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, -1, -1, 0xaf123456 // W32: encoding: [0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, 0.5, null, -src_scc mul:2 // W32: encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc_lo, -src_scc, -exec, 0.5 mul:4 // W32: encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[254:255], vcc_lo, 0xaf123456, -vcc, -1 clamp div:2 // W32: encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] -// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W64-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, v[1:2], v[2:3], v[3:4] // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x01,0x05,0x0e,0x04] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, v[254:255], v[254:255], s[6:7] // W64: encoding: [0x05,0x6a,0xfd,0xd6,0xfe,0xfd,0x1b,0x00] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, s[2:3], s[4:5], v[254:255] // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x02,0x08,0xf8,0x07] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, -s[104:105], s[104:105], -s[104:105] // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x68,0xd0,0xa0,0xa1] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, vcc, -ttmp[14:15], -ttmp[14:15] // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x6a,0xf4,0xe8,0xc1] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, -ttmp[14:15], 0xaf123456, null // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x7a,0xfe,0xf1,0x21,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, -exec, -src_scc, -exec // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x7e,0xfa,0xf9,0xe1] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, null, 0.5, vcc // W64: encoding: [0x05,0x6a,0xfd,0xd6,0x7c,0xe0,0xa9,0x01] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, -1, -1, 0xaf123456 // W64: encoding: [0x05,0x6a,0xfd,0xd6,0xc1,0x82,0xfd,0x03,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, 0.5, null, -src_scc mul:2 // W64: encoding: [0x05,0x6a,0xfd,0xd6,0xf0,0xf8,0xf4,0x8b] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// 
W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[5:6], vcc, -src_scc, -exec, 0.5 mul:4 // W64: encoding: [0x05,0x6a,0xfd,0xd6,0xfd,0xfc,0xc0,0x73] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_div_scale_f64 v[254:255], vcc, 0xaf123456, -vcc, -1 clamp div:2 // W64: encoding: [0xfe,0xea,0xfd,0xd6,0xff,0xd4,0x04,0x5b,0x56,0x34,0x12,0xaf] -// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid operand for instruction +// W32-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode v_dot2_bf16_bf16 v5, v1, v2, s3 // GFX11: encoding: [0x05,0x00,0x67,0xd6,0x01,0x05,0x0e,0x00] diff --git a/llvm/test/MC/AMDGPU/vop3.s b/llvm/test/MC/AMDGPU/vop3.s --- a/llvm/test/MC/AMDGPU/vop3.s +++ b/llvm/test/MC/AMDGPU/vop3.s @@ -411,17 +411,17 @@ // SICI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[22:23], v[20:21] ; encoding: [0x18,0x6a,0xdc,0xd2,0x16,0x2d,0x52,0x04] // VI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[22:23], v[20:21] ; encoding: [0x18,0x6a,0xe1,0xd1,0x16,0x2d,0x52,0x04] -v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp -// SICI: v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x24] -// VI: v_div_scale_f64 v[24:25], s[10:11], -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0x8a,0xe1,0xd1,0x16,0x29,0x52,0x24] +v_div_scale_f64 v[24:25], vcc, -v[22:23], v[20:21], v[20:21] clamp +// SICI: v_div_scale_f64 v[24:25], vcc, -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0x6a,0xdc,0xd2,0x16,0x29,0x52,0x24] +// VI: v_div_scale_f64 v[24:25], vcc, -v[22:23], v[20:21], v[20:21] clamp ; encoding: [0x18,0xea,0xe1,0xd1,0x16,0x29,0x52,0x24] -v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] clamp mul:2 -// SICI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] 
clamp mul:2 ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x4c] -// VI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], -v[20:21], v[20:21] clamp mul:2 ; encoding: [0x18,0x8a,0xe1,0xd1,0x16,0x29,0x52,0x4c] +v_div_scale_f64 v[24:25], vcc, v[22:23], -v[20:21], v[20:21] clamp mul:2 +// SICI: v_div_scale_f64 v[24:25], vcc, v[22:23], -v[20:21], v[20:21] clamp mul:2 ; encoding: [0x18,0x6a,0xdc,0xd2,0x16,0x29,0x52,0x4c] +// VI: v_div_scale_f64 v[24:25], vcc, v[22:23], -v[20:21], v[20:21] clamp mul:2 ; encoding: [0x18,0xea,0xe1,0xd1,0x16,0x29,0x52,0x4c] -v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] -// SICI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x0a,0xdc,0xd2,0x16,0x29,0x52,0x84] -// VI: v_div_scale_f64 v[24:25], s[10:11], v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x0a,0xe1,0xd1,0x16,0x29,0x52,0x84] +v_div_scale_f64 v[24:25], vcc, v[22:23], v[20:21], -v[20:21] +// SICI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x6a,0xdc,0xd2,0x16,0x29,0x52,0x84] +// VI: v_div_scale_f64 v[24:25], vcc, v[22:23], v[20:21], -v[20:21] ; encoding: [0x18,0x6a,0xe1,0xd1,0x16,0x29,0x52,0x84] v_div_scale_f32 v24, vcc, v22, v22, v20 // SICI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x04] @@ -439,9 +439,9 @@ // SICI: v_div_scale_f32 v24, vcc, v22, v22, -v20 clamp div:2 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x9c] // VI: v_div_scale_f32 v24, vcc, v22, v22, -v20 clamp div:2 ; encoding: [0x18,0xea,0xe0,0xd1,0x16,0x2d,0x52,0x9c] -v_div_scale_f32 v24, s[10:11], v22, v22, v20 -// SICI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xda,0xd2,0x16,0x2d,0x52,0x04] -// VI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xe0,0xd1,0x16,0x2d,0x52,0x04] +v_div_scale_f32 v24, vcc, v22, v22, v20 +// SICI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x04] +// VI: 
v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0x52,0x04] v_div_scale_f32 v24, vcc, v22, 1.0, v22 // SICI: v_div_scale_f32 v24, vcc, v22, 1.0, v22 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0xe5,0x59,0x04] diff --git a/llvm/test/MC/AMDGPU/wave32.s b/llvm/test/MC/AMDGPU/wave32.s --- a/llvm/test/MC/AMDGPU/wave32.s +++ b/llvm/test/MC/AMDGPU/wave32.s @@ -383,21 +383,21 @@ // GFX1032-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction // GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01] -v_div_scale_f32 v2, s2, v0, v0, v2 -// GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] -// GFX1064-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction +v_div_scale_f32 v2, vcc_lo, v0, v0, v2 +// GFX1032: v_div_scale_f32 v2, vcc_lo, v0, v0, v2 ; encoding: [0x02,0x6a,0x6d,0xd5,0x00,0x01,0x0a,0x04] +// GFX1064-ERR: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode -v_div_scale_f32 v2, s[2:3], v0, v0, v2 -// GFX1032-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction -// GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f32 v2, vcc, v0, v0, v2 +// GFX1032-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// GFX1064: v_div_scale_f32 v2, vcc, v0, v0, v2 ; encoding: [0x02,0x6a,0x6d,0xd5,0x00,0x01,0x0a,0x04] -v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] -// GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04] -// GFX1064-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction +v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], v[2:3] +// GFX1032: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x6a,0x6e,0xd5,0x00,0x01,0x0a,0x04] +// GFX1064-ERR: :[[@LINE-2]]:1: error: operands are not valid for this GPU or mode -v_div_scale_f64 v[2:3], 
s[2:3], v[0:1], v[0:1], v[2:3] -// GFX1032-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction -// GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] +// GFX1032-ERR: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +// GFX1064: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x6a,0x6e,0xd5,0x00,0x01,0x0a,0x04] v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] // GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04] diff --git a/llvm/test/MC/AMDGPU/wave_any.s b/llvm/test/MC/AMDGPU/wave_any.s --- a/llvm/test/MC/AMDGPU/wave_any.s +++ b/llvm/test/MC/AMDGPU/wave_any.s @@ -198,17 +198,17 @@ v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc // GFX10: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01] -v_div_scale_f32 v2, s2, v0, v0, v2 -// GFX10: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f32 v2, vcc_lo, v0, v0, v2 +// GFX10: v_div_scale_f32 v2, vcc, v0, v0, v2 ; encoding: [0x02,0x6a,0x6d,0xd5,0x00,0x01,0x0a,0x04] -v_div_scale_f32 v2, s[2:3], v0, v0, v2 -// GFX10: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f32 v2, vcc, v0, v0, v2 +// GFX10: v_div_scale_f32 v2, vcc, v0, v0, v2 ; encoding: [0x02,0x6a,0x6d,0xd5,0x00,0x01,0x0a,0x04] -v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] -// GFX10: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], v[2:3] +// GFX10: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x6a,0x6e,0xd5,0x00,0x01,0x0a,0x04] -v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] -// GFX10: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] ; encoding: 
[0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04] +v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] +// GFX10: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x6a,0x6e,0xd5,0x00,0x01,0x0a,0x04] v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] // GFX10: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] ; encoding: [0x00,0x06,0x77,0xd5,0x00,0x03,0x0a,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10-wave32.txt @@ -139,13 +139,13 @@ # GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; 0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01 -# GFX1032: v_div_scale_f32 v2, s2, v0, v0, v2 -# GFX1064: v_div_scale_f32 v2, s[2:3], v0, v0, v2 -0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04 +# GFX1032: v_div_scale_f32 v2, vcc_lo, v0, v0, v2 +# GFX1064: v_div_scale_f32 v2, vcc, v0, v0, v2 +0x02,0x6a,0x6d,0xd5,0x00,0x01,0x0a,0x04 -# GFX1032: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] -# GFX1064: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], v[2:3] -0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04 +# GFX1032: v_div_scale_f64 v[2:3], vcc_lo, v[0:1], v[0:1], v[2:3] +# GFX1064: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], v[2:3] +0x02,0x6a,0x6e,0xd5,0x00,0x01,0x0a,0x04 # GFX1032: v_mad_i64_i32 v[0:1], s6, v0, v1, v[2:3] # GFX1064: v_mad_i64_i32 v[0:1], s[6:7], v0, v1, v[2:3] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1,5 +1,5 @@ -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefixes=GFX10,W32 %s -# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace 
-check-prefixes=GFX10,W64 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W32 %s +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX10,W64 %s # GFX10: v_add3_u32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x6d,0xd7,0x01,0x05,0x0e,0x04] @@ -5838,169 +5838,169 @@ # GFX10: v_div_fmas_f64 v[5:6], |v[1:2]|, |v[2:3]|, |v[3:4]| ; encoding: [0x05,0x07,0x70,0xd5,0x01,0x05,0x0e,0x04] 0x05,0x07,0x70,0xd5,0x01,0x05,0x0e,0x04 -# W32: v_div_scale_f32 v255, s0, v1, v2, v3 ; encoding: [0xff,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -# W64: v_div_scale_f32 v255, s[0:1], v1, v2, v3 ; encoding: [0xff,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -0xff,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04 +# W32: v_div_scale_f32 v255, vcc_lo, v1, v2, v3 ; encoding: [0xff,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +# W64: v_div_scale_f32 v255, vcc, v1, v2, v3 ; encoding: [0xff,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +0xff,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, -1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xc1,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], -1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xc1,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0xc1,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, -1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xc1,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, -1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xc1,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0xc1,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, -4.0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xf7,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], -4.0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xf7,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0xf7,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, -4.0, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xf7,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, -4.0, v2, v3 ; encoding: 
[0x05,0x6a,0x6d,0xd5,0xf7,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0xf7,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, 0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x80,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], 0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x80,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x80,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, 0, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x80,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, 0, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x80,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x80,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, 0.5, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xf0,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], 0.5, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xf0,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0xf0,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, 0.5, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xf0,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, 0.5, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xf0,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0xf0,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, exec_hi, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7f,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], exec_hi, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7f,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x7f,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, exec_hi, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7f,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, exec_hi, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7f,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x7f,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, exec_lo, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7e,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], exec_lo, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7e,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x7e,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, exec_lo, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7e,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, exec_lo, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7e,0x04,0x0e,0x04] 
+0x05,0x6a,0x6d,0xd5,0x7e,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, m0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7c,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], m0, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x7c,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x7c,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, m0, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7c,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, m0, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x7c,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x7c,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, s1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], s1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x01,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, s1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, s1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, s103, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x67,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], s103, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x67,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x67,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, s103, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x67,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, s103, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x67,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x67,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, ttmp11, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x77,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], ttmp11, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x77,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x77,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, ttmp11, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x77,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, ttmp11, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x77,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x77,0x04,0x0e,0x04 -# W32: 
v_div_scale_f32 v5, s0, v1, -1, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x83,0x0d,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, -1, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x83,0x0d,0x04] -0x05,0x00,0x6d,0xd5,0x01,0x83,0x0d,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, -1, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x83,0x0d,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, -1, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x83,0x0d,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0x83,0x0d,0x04 -# W32: v_div_scale_f32 v5, s0, v1, -4.0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0d,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, -4.0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0d,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xef,0x0d,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, -4.0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0d,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, -4.0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0d,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0d,0x04 -# W32: v_div_scale_f32 v5, s0, v1, 0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x01,0x0d,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, 0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x01,0x0d,0x04] -0x05,0x00,0x6d,0xd5,0x01,0x01,0x0d,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, 0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x01,0x0d,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, 0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x01,0x0d,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0x01,0x0d,0x04 -# W32: v_div_scale_f32 v5, s0, v1, 0.5, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xe1,0x0d,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, 0.5, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xe1,0x0d,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xe1,0x0d,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, 0.5, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xe1,0x0d,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, 0.5, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xe1,0x0d,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xe1,0x0d,0x04 -# W32: v_div_scale_f32 v5, s0, v1, exec_hi, v3 ; encoding: 
[0x05,0x00,0x6d,0xd5,0x01,0xff,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, exec_hi, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xff,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, exec_hi, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, exec_hi, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, exec_lo, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xfd,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, exec_lo, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xfd,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xfd,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, exec_lo, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xfd,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, exec_lo, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xfd,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xfd,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, m0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xf9,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, m0, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xf9,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xf9,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, m0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xf9,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, m0, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xf9,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xf9,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, s103, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xcf,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, s103, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xcf,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xcf,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, s103, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xcf,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, s103, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xcf,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xcf,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, s2, v3 ; encoding: 
[0x05,0x00,0x6d,0xd5,0x01,0x05,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, s2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, s2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, s2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, ttmp11, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, ttmp11, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xef,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xef,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, ttmp11, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, ttmp11, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xef,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, v2, -1 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x06,0x03] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, -1 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x06,0x03] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x06,0x03 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, -1 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x06,0x03] +# W64: v_div_scale_f32 v5, vcc, v1, v2, -1 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x06,0x03] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x06,0x03 -# W32: v_div_scale_f32 v5, s0, v1, v2, -4.0 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x03] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, -4.0 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x03] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x03 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, -4.0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x03] +# W64: v_div_scale_f32 v5, vcc, v1, v2, -4.0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x03] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x03 -# W32: v_div_scale_f32 v5, s0, v1, v2, 0 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x02,0x02] -# W64: 
v_div_scale_f32 v5, s[0:1], v1, v2, 0 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x02,0x02] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x02,0x02 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, 0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x02,0x02] +# W64: v_div_scale_f32 v5, vcc, v1, v2, 0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x02,0x02] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x02,0x02 -# W32: v_div_scale_f32 v5, s0, v1, v2, 0.5 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xc2,0x03] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, 0.5 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xc2,0x03] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xc2,0x03 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, 0.5 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xc2,0x03] +# W64: v_div_scale_f32 v5, vcc, v1, v2, 0.5 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xc2,0x03] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xc2,0x03 -# W32: v_div_scale_f32 v5, s0, v1, v2, exec_hi ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, exec_hi ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, exec_hi ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, exec_hi ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, exec_lo ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfa,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, exec_lo ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfa,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xfa,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, exec_lo ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfa,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, exec_lo ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfa,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfa,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, m0 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xf2,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, m0 ; 
encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xf2,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xf2,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, m0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xf2,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, m0 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xf2,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xf2,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, s103 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x9e,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, s103 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x9e,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x9e,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, s103 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x9e,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, s103 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x9e,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x9e,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x00] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, s3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x00] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x00 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, s3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x00] +# W64: v_div_scale_f32 v5, vcc, v1, v2, s3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x00] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x00 -# W32: v_div_scale_f32 v5, s0, v1, v2, ttmp11 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, ttmp11 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xde,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, ttmp11 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, ttmp11 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xde,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, v255 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x07] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, v255 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x07] 
-0x05,0x00,0x6d,0xd5,0x01,0x05,0xfe,0x07 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, v255 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x07] +# W64: v_div_scale_f32 v5, vcc, v1, v2, v255 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x07] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xfe,0x07 -# W32: v_div_scale_f32 v5, s0, v1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x01,0x05,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, v1, v2, vcc_hi ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xae,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, vcc_hi ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xae,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xae,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, vcc_hi ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xae,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, vcc_hi ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xae,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xae,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v2, vcc_lo ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xaa,0x01] -# W64: v_div_scale_f32 v5, s[0:1], v1, v2, vcc_lo ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0x05,0xaa,0x01] -0x05,0x00,0x6d,0xd5,0x01,0x05,0xaa,0x01 +# W32: v_div_scale_f32 v5, vcc_lo, v1, v2, vcc_lo ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xaa,0x01] +# W64: v_div_scale_f32 v5, vcc, v1, v2, vcc_lo ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0x05,0xaa,0x01] +0x05,0x6a,0x6d,0xd5,0x01,0x05,0xaa,0x01 -# W32: v_div_scale_f32 v5, s0, v1, v255, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0f,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, v255, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xff,0x0f,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xff,0x0f,0x04 +# 
W32: v_div_scale_f32 v5, vcc_lo, v1, v255, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0f,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, v255, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0f,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xff,0x0f,0x04 -# W32: v_div_scale_f32 v5, s0, v1, vcc_hi, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd7,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, vcc_hi, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd7,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xd7,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, vcc_hi, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd7,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, vcc_hi, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd7,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xd7,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v1, vcc_lo, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd5,0x0c,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v1, vcc_lo, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x01,0xd5,0x0c,0x04] -0x05,0x00,0x6d,0xd5,0x01,0xd5,0x0c,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v1, vcc_lo, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd5,0x0c,0x04] +# W64: v_div_scale_f32 v5, vcc, v1, vcc_lo, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x01,0xd5,0x0c,0x04] +0x05,0x6a,0x6d,0xd5,0x01,0xd5,0x0c,0x04 -# W32: v_div_scale_f32 v5, s0, v255, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xff,0x05,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], v255, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0xff,0x05,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0xff,0x05,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, v255, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xff,0x05,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, v255, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0xff,0x05,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0xff,0x05,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, vcc_hi, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x6b,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], vcc_hi, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x6b,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x6b,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, 
vcc_lo, vcc_hi, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x6b,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, vcc_hi, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x6b,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x6b,0x04,0x0e,0x04 -# W32: v_div_scale_f32 v5, s0, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x6a,0x04,0x0e,0x04] -# W64: v_div_scale_f32 v5, s[0:1], vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x6d,0xd5,0x6a,0x04,0x0e,0x04] -0x05,0x00,0x6d,0xd5,0x6a,0x04,0x0e,0x04 +# W32: v_div_scale_f32 v5, vcc_lo, vcc_lo, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x6a,0x04,0x0e,0x04] +# W64: v_div_scale_f32 v5, vcc, vcc_lo, v2, v3 ; encoding: [0x05,0x6a,0x6d,0xd5,0x6a,0x04,0x0e,0x04] +0x05,0x6a,0x6d,0xd5,0x6a,0x04,0x0e,0x04 # GFX10: v_exp_f16_e64 v255, v1 ; encoding: [0xff,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] 0xff,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00