diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -985,6 +985,7 @@ // We need to handle this here because tablegen doesn't support matching // instructions with multiple outputs. void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { + const GCNSubtarget *ST = static_cast(Subtarget); SDLoc SL(N); EVT VT = N->getValueType(0); @@ -1000,6 +1001,18 @@ SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]); SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]); CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops); + + // DIV_SCALE as a DAG node returns two values, an f32/f64 and also an i1 + // which is meant to represent VCC. However, in MIR, VCC is an implicit + // def. We need to replace all uses of that i1 with VCC directly. + if (!SDValue(N, 1).use_empty()) { + auto *RI = ST->getRegisterInfo(); + const auto VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC; + const auto Reg = + CurDAG->getMachineFunction().addLiveIn(VCC, RI->getWaveMaskRegClass()); + CurDAG->ReplaceAllUsesOfValueWith( + SDValue(N, 1), CurDAG->getCopyFromReg(SDValue(N, 0), SL, Reg, MVT::i1)); + } } // We need to handle this here because tablegen doesn't support matching diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -937,17 +937,26 @@ Register Src0 = ChooseDenom != 0 ? 
Numer : Denom; auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0) - .addDef(Dst1) - .addImm(0) // $src0_modifiers - .addUse(Src0) // $src0 - .addImm(0) // $src1_modifiers - .addUse(Denom) // $src1 - .addImm(0) // $src2_modifiers - .addUse(Numer) // $src2 - .addImm(0) // $clamp - .addImm(0); // $omod + .addImm(0) // $src0_modifiers + .addUse(Src0) // $src0 + .addImm(0) // $src1_modifiers + .addUse(Denom) // $src1 + .addImm(0) // $src2_modifiers + .addUse(Numer) // $src2 + .addImm(0) // $clamp + .addImm(0); // $omod MI.eraseFromParent(); + if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI)) { + return false; + } + + // We implicitly def VCC in V_DIV_SCALE, but some insts may still use + // the VReg that the intrinsic returned (Dst1). Create a copy of VCC + // to that register so the value can still be used. + MIB = BuildMI(*MBB, MIB->getNextNode(), DL, TII.get(AMDGPU::COPY), Dst1) + .addReg(STI.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC); + MRI->setRegClass(Dst1, TRI.getWaveMaskRegClass()); return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2414,7 +2414,7 @@ field Operand Src0ModSDWA = getSrcModSDWA.ret; field Operand Src1ModSDWA = getSrcModSDWA.ret; - + field bit SDstIsAlwaysVCC = false; field bit HasDst = !ne(DstVT.Value, untyped.Value); field bit HasDst32 = HasDst; field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -20,15 +20,15 @@ } class VOP3b_Profile : VOPProfile<[vt, vt, vt, vt]> { - let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst); - let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; 
+ let Asm64 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod"; let IsSingle = 1; let HasExtVOP3DPP = 0; let HasExtDPP = 0; + let SDstIsAlwaysVCC = true; } -def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile; -def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile; +def VOP3b_F32_VCC_F32_F32_F32 : VOP3b_Profile; +def VOP3b_F64_VCC_F64_F64_F64 : VOP3b_Profile; def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> { let HasClamp = 1; @@ -220,13 +220,13 @@ } // End isReMaterializable = 1 -let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. +let Defs = [VCC], mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does. let SchedRW = [WriteFloatFMA, WriteSALU] in - defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32> ; + defm V_DIV_SCALE_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f32", VOP3b_F32_VCC_F32_F32_F32> ; // Double precision division pre-scale. let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in - defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>; + defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_VCC_F64_F64_F64>; } // End mayRaiseFPException = 0 let isReMaterializable = 1 in diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -322,7 +322,8 @@ bits<2> omod; let Inst{7-0} = vdst; - let Inst{14-8} = sdst; + // FIXME: Not sure this works. NOTE(review): a literal 0 in the SDST field looks like the SGPR0 operand encoding rather than VCC -- verify against the ISA operand encoding table. 
+ let Inst{14-8} = !if(P.SDstIsAlwaysVCC, 0, sdst); let Inst{31-26} = 0x34; //encoding let Inst{40-32} = !if(P.HasSrc0, src0, 0); let Inst{49-41} = !if(P.HasSrc1, src1, 0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/constant-bus-restriction.ll @@ -216,18 +216,13 @@ ; GFX9-LABEL: div_scale_s_s_true: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], s2, v0, s2 +; GFX9-NEXT: v_div_scale_f32 v0, vcc, s2, v0, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: div_scale_s_s_true: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f32 v0, s0, s2, s3, s2 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: div_scale_s_s_true: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: div_scale_s_s_true: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: v_div_scale_f32 v0, vcc, s2, s3, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 true) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result @@ -237,18 +232,13 @@ ; GFX9-LABEL: div_scale_s_s_false: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s2 +; GFX9-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s2 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: div_scale_s_s_false: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_div_scale_f32 v0, s0, s3, s3, s2 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: div_scale_s_s_false: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 -; GFX11-NEXT: ; return to shader part epilog +; GFX10PLUS-LABEL: div_scale_s_s_false: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: 
v_div_scale_f32 v0, vcc, s3, s3, s2 +; GFX10PLUS-NEXT: ; return to shader part epilog %div.scale = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %src0, float %src1, i1 false) %result = extractvalue { float, i1 } %div.scale, 0 ret float %result @@ -261,3 +251,6 @@ attributes #0 = { nounwind readnone speculatable willreturn } attributes #1 = { nounwind readnone speculatable } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -21,7 +21,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -40,7 +40,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -140,7 +140,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, 
v3, 1.0 @@ -159,7 +159,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -221,7 +221,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -240,7 +240,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -302,7 +302,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -321,7 +321,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 
v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -410,7 +410,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -429,7 +429,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -506,7 +506,7 @@ ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -525,7 +525,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -589,7 +589,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; 
GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -600,7 +600,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -620,7 +620,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -637,7 +637,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -820,7 +820,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 
1.0 @@ -831,7 +831,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -851,7 +851,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -868,7 +868,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -982,7 +982,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 @@ -993,7 +993,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v2 ; 
GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1013,7 +1013,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1030,7 +1030,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1137,7 +1137,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 @@ -1148,7 +1148,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1168,7 +1168,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1185,7 +1185,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1350,7 +1350,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0 @@ -1361,7 +1361,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v2 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1381,7 +1381,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v0, v2 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 @@ -1398,7 +1398,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v4 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1533,7 +1533,7 @@ ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1544,7 +1544,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -1564,7 +1564,7 @@ ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1581,7 +1581,7 @@ ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1 +; 
GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -18,7 +18,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -34,7 +34,7 @@ ; GFX6-FLUSH-LABEL: v_fdiv_f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -52,7 +52,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -68,18 +68,18 @@ ; GFX89-FLUSH-LABEL: v_fdiv_f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 
v4, vcc, v0, v1, v0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -87,8 +87,8 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -104,8 +104,8 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -123,8 +123,8 @@ ; GFX11-IEEE: ; 
%bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff @@ -145,8 +145,8 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -200,7 +200,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -229,7 +229,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -246,8 +246,8 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -275,8 +275,8 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff @@ -315,7 +315,7 @@ ; GFX6-IEEE-LABEL: v_rcp_f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -331,7 +331,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -349,7 +349,7 @@ ; GFX89-IEEE-LABEL: v_rcp_f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, 
v0, 1.0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -365,18 +365,18 @@ ; GFX89-FLUSH-LABEL: v_rcp_f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -384,8 +384,8 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 @@ -401,8 +401,8 @@ ; GFX10-FLUSH: ; %bb.0: ; 
GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -420,8 +420,8 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff @@ -442,8 +442,8 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -469,7 +469,7 @@ ; GFX6-IEEE-LABEL: v_rcp_f32_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -485,7 +485,7 @@ ; 
GFX6-FLUSH-LABEL: v_rcp_f32_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -503,7 +503,7 @@ ; GFX89-IEEE-LABEL: v_rcp_f32_arcp: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1 ; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -519,18 +519,18 @@ ; GFX89-FLUSH-LABEL: v_rcp_f32_arcp: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, 
v4 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -538,8 +538,8 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2 @@ -555,8 +555,8 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, s4, v0, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0 @@ -574,8 +574,8 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff @@ -596,8 +596,8 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v1, null, v0, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v0, 1.0 +; 
GFX11-FLUSH-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v2, v1 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -739,7 +739,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -768,7 +768,7 @@ ; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 ; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 @@ -785,8 +785,8 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 ; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 @@ -814,8 +814,8 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0 ; GFX11-IEEE-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff @@ -854,7 +854,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -864,7 +864,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -881,7 +881,7 @@ ; GFX6-FLUSH-LABEL: v_fdiv_v2f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -893,17 +893,17 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v3, v3, v1 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v5, v6, 
1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v4, v6, v6 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v5, v6, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5 ; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v5, v6, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 @@ -912,59 +912,58 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; 
GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_fdiv_v2f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v7, -v4, v6, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v6, v7, v6, v6 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v7, v5, v6 -; GFX89-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v5 -; GFX89-FLUSH-NEXT: v_fma_f32 v7, v8, v6, v7 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6 +; GFX89-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v6, v7 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v2, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v5, v7, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v7, v7 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v2 -; GFX89-FLUSH-NEXT: v_fma_f32 v7, -v5, v4, v6 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v7, v2, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v5, v4, v6 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v5, v2, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v2 +; GFX89-FLUSH-NEXT: v_fma_f32 v7, -v5, v6, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6 +; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v5, v6, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v5, v2, v4 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v2, v6 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -972,28 +971,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; 
GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1001,8 +999,8 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; 
GFX10-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1012,20 +1010,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 ; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v7, vcc, v1, v3, v1 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v5, v6, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v2, v6 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v5, v2, v7 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v6 +; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v5, v2, v7 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v6, v2 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1033,36 +1031,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1070,8 +1069,8 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, vcc_lo, v0, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -1085,26 +1084,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5 ; GFX11-FLUSH-NEXT: 
v_fma_f32 v4, -v4, v7, v6 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v5, v6 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v7, vcc, v1, v3, v1 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v6, v5 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v5, v6, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v6, v2, v6 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5 -; GFX11-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v6 +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v5, v2, v7 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v6 +; GFX11-FLUSH-NEXT: v_fma_f32 v4, -v5, v2, v7 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v6, v2 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] 
%fdiv = fdiv <2 x float> %a, %b @@ -1148,7 +1147,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1158,7 +1157,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -1194,28 +1193,27 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, 
v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1223,28 +1221,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; 
GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1270,36 +1267,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; 
GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1329,7 +1327,7 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f32: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1339,7 +1337,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1356,7 +1354,7 @@ ; GFX6-FLUSH-LABEL: 
v_rcp_v2f32: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1368,72 +1366,71 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-IEEE-LABEL: v_rcp_v2f32: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-IEEE-NEXT: 
v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; 
GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_rcp_v2f32: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v2, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v3, v2, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 @@ -1447,28 +1444,27 
@@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc, 1.0, v1, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, 
-v3, v5, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1476,8 +1472,8 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1487,20 +1483,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v2, v5 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v3, -v3, v2, v5 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: 
v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v2 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1508,36 +1504,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 -; GFX11-IEEE-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: 
v_fma_f32 v3, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v6, vcc, 1.0, v1, 1.0 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1545,8 +1542,8 @@ ; GFX11-FLUSH: ; %bb.0: ; 
GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -1560,26 +1557,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v2, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v2, v5, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, 
-v3, v2, v5 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v3, -v3, v2, v5 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v2 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> , %x @@ -1590,7 +1587,7 @@ ; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1600,7 +1597,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5 ; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 @@ -1617,7 +1614,7 @@ ; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp: ; GFX6-FLUSH: ; %bb.0: ; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1629,72 +1626,71 @@ ; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 ; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v1, v1, 1.0 +; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 +; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0 -; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4 -; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2 -; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 ; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3 -; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6 -; 
GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4 -; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9 +; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX89-IEEE-NEXT: v_fma_f32 v6, -v3, v5, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v5, v5 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v6, v4, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v4 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp: ; GFX89-FLUSH: ; %bb.0: ; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; GFX89-FLUSH-NEXT: 
v_rcp_f32_e32 v3, v2 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3 +; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0 -; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 ; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0 -; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 ; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v2, v3 ; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0 -; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5 +; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v3, v2, 1.0 +; GFX89-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2 ; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 ; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4 ; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5 @@ -1708,28 +1704,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v0, v0, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v3, s4, v1, v1, 1.0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX10-IEEE-NEXT: 
v_fma_f32 v7, -v3, v5, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v4 -; GFX10-IEEE-NEXT: v_div_scale_f32 v6, s4, 1.0, v1, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX10-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, 1.0, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX10-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v6, vcc, 1.0, v1, 1.0 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1737,8 +1732,8 @@ ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v0, v0, 
1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 ; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0 @@ -1748,20 +1743,20 @@ ; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX10-FLUSH-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX10-FLUSH-NEXT: s_denorm_mode 3 -; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 -; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v2, v4 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v5, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v3, v2, v5 +; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX10-FLUSH-NEXT: v_fma_f32 v3, -v3, v2, v5 ; GFX10-FLUSH-NEXT: s_denorm_mode 0 -; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v2 ; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1769,36 +1764,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v3, null, v1, v1, 1.0 -; 
GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc_lo, 1.0, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v3 +; GFX11-IEEE-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, 1.0, v0, 1.0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v4, v6, v4 :: v_dual_fmac_f32 v5, v7, v5 -; GFX11-IEEE-NEXT: v_div_scale_f32 v6, s0, 1.0, v1, 1.0 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5 -; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4 +; GFX11-IEEE-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v6, vcc, 1.0, v1, 1.0 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v4, v3 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v4, v5, v4 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v6, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v4 +; GFX11-IEEE-NEXT: v_fma_f32 v3, -v3, v5, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v5 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1806,8 +1802,8 @@ ; GFX11-FLUSH: ; %bb.0: ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v0, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, 1.0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; 
GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 @@ -1821,26 +1817,26 @@ ; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 ; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, null, v1, v1, 1.0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v4 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v1, v1, 1.0 +; GFX11-FLUSH-NEXT: v_div_scale_f32 v5, vcc, 1.0, v1, 1.0 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0 +; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v4, v3 ; GFX11-FLUSH-NEXT: s_denorm_mode 3 ; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0 -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3 +; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v3, v4, 1.0 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v4, v2, v4 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_mul_f32_e32 v2, v5, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v3, v2, v5 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2 +; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v2, v6, v4 +; GFX11-FLUSH-NEXT: v_fma_f32 v3, -v3, v2, v5 ; GFX11-FLUSH-NEXT: s_denorm_mode 0 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FLUSH-NEXT: 
v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v2 ; GFX11-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0 ; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> , %x @@ -1994,7 +1990,7 @@ ; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX6-IEEE: ; %bb.0: ; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 ; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -2004,7 +2000,7 @@ ; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7 ; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6 ; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7 -; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5 ; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 ; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 @@ -2040,28 +2036,27 @@ ; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25: ; GFX89-IEEE: ; %bb.0: ; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 -; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0 -; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4 -; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5 -; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8 -; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0 -; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8 -; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9 -; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6 -; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10 -; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11 -; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, 
v6 -; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7 -; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10 -; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5] -; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11 +; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v0, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v4 +; GFX89-IEEE-NEXT: v_fma_f32 v7, -v4, v6, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v6, v7, v6, v6 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v7, v5, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v5 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; GFX89-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v1, v3, v1 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX89-IEEE-NEXT: v_fma_f32 v8, -v5, v7, 1.0 +; GFX89-IEEE-NEXT: v_fma_f32 v7, v8, v7, v7 +; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v6, v7 +; GFX89-IEEE-NEXT: v_fma_f32 v9, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_fma_f32 v8, v9, v7, v8 +; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v8, v6 +; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2069,28 +2064,27 @@ ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v4, s4, v2, v2, v0 -; GFX10-IEEE-NEXT: v_div_scale_f32 v5, s4, v3, v3, v1 -; GFX10-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v4 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v5 -; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_div_scale_f32 v8, s4, v1, v3, v1 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX10-IEEE-NEXT: 
v_mul_f32_e32 v11, v8, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX10-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v4 +; GFX10-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX10-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX10-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX10-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -2116,36 +2110,37 @@ ; GFX11-IEEE: ; %bb.0: ; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v4, null, v2, v2, v0 -; GFX11-IEEE-NEXT: v_div_scale_f32 v5, null, v3, v3, v1 -; GFX11-IEEE-NEXT: v_div_scale_f32 v10, vcc_lo, v0, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: 
v_rcp_f32_e32 v6, v4 -; GFX11-IEEE-NEXT: v_rcp_f32_e32 v7, v5 +; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 +; GFX11-IEEE-NEXT: v_div_scale_f32 v7, vcc, v0, v2, v0 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, 1.0 -; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, 1.0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_dual_fmac_f32 v6, v8, v6 :: v_dual_fmac_f32 v7, v9, v7 -; GFX11-IEEE-NEXT: v_div_scale_f32 v8, s0, v1, v3, v1 -; GFX11-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7 -; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8 -; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9 -; GFX11-IEEE-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11 +; GFX11-IEEE-NEXT: v_fma_f32 v6, -v4, v5, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v5 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v6, v7, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v8, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v8, v5 +; GFX11-IEEE-NEXT: v_fma_f32 v4, -v4, v6, v7 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v6 +; GFX11-IEEE-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 +; GFX11-IEEE-NEXT: v_div_scale_f32 v8, vcc, v1, v3, v1 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0 -; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_rcp_f32_e32 v6, v5 +; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff +; GFX11-IEEE-NEXT: v_fma_f32 v7, -v5, v6, 1.0 +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v6, v7, v6 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_mul_f32_e32 v7, v8, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v9, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v6 +; GFX11-IEEE-NEXT: v_fma_f32 v5, -v5, v7, v8 +; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-IEEE-NEXT: v_div_fmas_f32 v5, v5, v6, v7 ; GFX11-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1 ; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll @@ -18,8 +18,8 @@ ; GFX6-LABEL: v_fdiv_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: 
v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -37,7 +37,7 @@ ; GFX8-LABEL: v_fdiv_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -53,7 +53,7 @@ ; GFX9-LABEL: v_fdiv_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -70,8 +70,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -87,8 +87,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ 
-161,8 +161,8 @@ ; GFX6-LABEL: v_fdiv_f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -180,7 +180,7 @@ ; GFX8-LABEL: v_fdiv_f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -196,7 +196,7 @@ ; GFX9-LABEL: v_fdiv_f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -213,8 +213,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -230,8 +230,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -255,8 +255,8 @@ ; GFX6-LABEL: v_rcp_f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -275,7 +275,7 @@ ; GFX8-LABEL: v_rcp_f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -291,7 +291,7 @@ ; GFX9-LABEL: v_rcp_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -308,8 +308,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, 
v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -325,8 +325,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -350,8 +350,8 @@ ; GFX6-LABEL: v_rcp_f64_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -370,7 +370,7 @@ ; GFX8-LABEL: v_rcp_f64_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -386,7 +386,7 @@ ; GFX9-LABEL: v_rcp_f64_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: 
v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -403,8 +403,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -420,8 +420,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -494,8 +494,8 @@ ; GFX6-LABEL: v_rcp_f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX6-NEXT: v_mov_b32_e32 v10, 0x3ff00000 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 @@ -514,7 +514,7 @@ ; GFX8-LABEL: v_rcp_f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX8-NEXT: 
v_rcp_f64_e32 v[4:5], v[2:3] ; GFX8-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX8-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -530,7 +530,7 @@ ; GFX9-LABEL: v_rcp_f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 ; GFX9-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX9-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -547,8 +547,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[2:3], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX10-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; GFX10-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] @@ -564,8 +564,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[2:3], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, 1.0, v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -638,8 +638,8 @@ ; GFX6-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 
v[0:1], v[2:3], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 @@ -657,7 +657,7 @@ ; GFX8-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -673,7 +673,7 @@ ; GFX9-LABEL: v_fdiv_f64_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -690,8 +690,8 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[2:3], v[2:3], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] @@ -707,8 +707,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -732,90 +732,87 @@ ; GFX6-LABEL: v_fdiv_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; 
GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; 
GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: 
v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -823,28 +820,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], 
v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -852,37 +848,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], 
v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b @@ -969,90 +965,87 @@ ; GFX6-LABEL: v_fdiv_v2f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: 
v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; 
GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] 
+; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], 
v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; 
GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1060,28 +1053,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; 
GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1089,37 +1081,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> %a, %b, !fpmath !0 @@ -1130,91 +1122,88 @@ ; GFX6-LABEL: v_rcp_v2f64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], 
v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: 
v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; 
GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], 
v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,28 +1211,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; 
GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; 
GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1251,37 +1239,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], 
v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x @@ -1292,91 +1280,88 @@ ; GFX6-LABEL: v_rcp_v2f64_arcp: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], 
v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_arcp: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: 
v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], 
v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64_arcp: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: 
v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1384,28 +1369,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: 
v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1413,37 +1397,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> , %x @@ -1530,91 +1514,88 @@ ; GFX6-LABEL: v_rcp_v2f64_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] -; GFX6-NEXT: v_mov_b32_e32 v18, 0x3ff00000 -; GFX6-NEXT: v_rcp_f64_e32 v[14:15], 
v[10:11] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 -; GFX6-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 -; GFX6-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v9 -; GFX6-NEXT: v_mul_f64 v[12:13], v[8:9], v[6:7] +; GFX6-NEXT: v_mov_b32_e32 v14, 0x3ff00000 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v11 ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 -; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[12:13], v[8:9] -; GFX6-NEXT: v_fma_f64 v[4:5], -v[10:11], v[14:15], 1.0 -; GFX6-NEXT: v_div_scale_f64 v[16:17], s[6:7], 1.0, v[2:3], 1.0 -; GFX6-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[4:5] -; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[8:9], v[6:7], v[12:13] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[10:11], v[14:15], v[16:17] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v18, v17 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v11 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX6-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX6-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX6-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX6-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 +; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v7 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v14, v13 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 -; GFX6-NEXT: s_nop 0 -; GFX6-NEXT: v_div_fmas_f64 v[4:5], v[8:9], 
v[4:5], v[14:15] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX6-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_rcp_v2f64_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX8-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX8-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX8-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 -; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX8-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX8-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; 
GFX8-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX8-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX8-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX8-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX8-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX8-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX8-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX8-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_rcp_v2f64_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 -; GFX9-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 -; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX9-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 
-; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] -; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] -; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] -; GFX9-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] -; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] -; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] -; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[2:3], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 +; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1622,28 +1603,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[4:5], s4, v[0:1], v[0:1], 1.0 -; GFX10-NEXT: 
v_div_scale_f64 v[6:7], s4, v[2:3], v[2:3], 1.0 -; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX10-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[12:13], s4, 1.0, v[2:3], 1.0 -; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX10-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX10-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: 
v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX10-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1651,37 +1631,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[0:1], v[0:1], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[2:3], v[2:3], 1.0 -; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc_lo, 1.0, v[0:1], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] -; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, 1.0, v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 -; GFX11-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[12:13], s0, 1.0, v[2:3], 1.0 -; GFX11-NEXT: v_fma_f64 v[10:11], 
v[10:11], v[14:15], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[8:9] -; GFX11-NEXT: v_mul_f64 v[18:19], v[12:13], v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[16:17] -; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; GFX11-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc, v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[2:3], 1.0 ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 
v[10:11], -v[6:7], v[8:9], 1.0 +; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[10:11], v[12:13], v[8:9] +; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[10:11] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x double> , %x, !fpmath !0 @@ -1768,90 +1748,87 @@ ; GFX6-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX6-NEXT: v_div_scale_f64 v[14:15], s[4:5], v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX6-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] ; GFX6-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] -; GFX6-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[0:1], v[4:5], v[0:1] -; GFX6-NEXT: v_rcp_f64_e32 v[16:17], v[14:15] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v15 +; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v19 ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v9 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] ; GFX6-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX6-NEXT: v_fma_f64 v[12:13], -v[14:15], v[16:17], 1.0 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v15 -; GFX6-NEXT: v_fma_f64 v[12:13], v[16:17], v[12:13], v[16:17] -; GFX6-NEXT: v_mul_f64 v[16:17], v[18:19], v[10:11] -; GFX6-NEXT: v_fma_f64 v[18:19], -v[8:9], v[16:17], v[18:19] -; GFX6-NEXT: v_fma_f64 v[8:9], -v[14:15], v[12:13], 1.0 -; GFX6-NEXT: 
v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] -; GFX6-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; GFX6-NEXT: v_div_scale_f64 v[12:13], s[6:7], v[2:3], v[6:7], v[2:3] -; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[10:11], v[4:5], v[0:1] -; GFX6-NEXT: v_mul_f64 v[16:17], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 -; GFX6-NEXT: v_fma_f64 v[18:19], -v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[8:9], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[14:15], v[10:11], v[12:13] +; GFX6-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX6-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] +; GFX6-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v17 ; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: s_nop 1 -; GFX6-NEXT: v_div_fmas_f64 v[8:9], v[18:19], v[8:9], v[16:17] -; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[8:9], v[6:7], v[2:3] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX6-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX6-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX6-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX6-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] +; GFX6-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX8-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX8-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX8-NEXT: 
v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX8-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX8-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX8-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX8-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX8-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX8-NEXT: s_mov_b64 vcc, s[4:5] -; GFX8-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX8-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX8-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX8-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX8-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; GFX8-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX8-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX8-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX8-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX8-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX8-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX8-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX8-NEXT: v_fma_f64 
v[16:17], -v[10:11], v[12:13], 1.0 +; GFX8-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX8-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX8-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX8-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX8-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fdiv_v2f64_arcp_ulp25: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[6:7], v[6:7], v[2:3] -; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX9-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX9-NEXT: v_div_scale_f64 v[18:19], vcc, v[0:1], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[14:15], 1.0 -; GFX9-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] -; GFX9-NEXT: v_mul_f64 v[16:17], v[18:19], v[12:13] -; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[16:17], v[18:19] -; GFX9-NEXT: v_div_scale_f64 v[18:19], s[4:5], v[2:3], v[6:7], v[2:3] -; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[16:17] -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_mul_f64 v[20:21], v[18:19], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; GFX9-NEXT: v_fma_f64 
v[10:11], v[10:11], v[14:15], v[10:11] +; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] +; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] +; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] -; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[20:21], v[18:19] -; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[20:21] +; GFX9-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX9-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX9-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; GFX9-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; GFX9-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] +; GFX9-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] +; GFX9-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; GFX9-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1859,28 +1836,27 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_div_scale_f64 v[8:9], s4, v[4:5], v[4:5], v[0:1] -; GFX10-NEXT: v_div_scale_f64 v[10:11], s4, v[6:7], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX10-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX10-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX10-NEXT: 
v_div_scale_f64 v[16:17], s4, v[2:3], v[6:7], v[2:3] -; GFX10-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX10-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX10-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX10-NEXT: s_mov_b32 vcc_lo, s4 -; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX10-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX10-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX10-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX10-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX10-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX10-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX10-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -1888,37 +1864,37 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[4:5], v[4:5], v[0:1] -; GFX11-NEXT: v_div_scale_f64 v[10:11], null, v[6:7], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[20:21], vcc_lo, v[0:1], v[4:5], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[8:9] -; GFX11-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[14:15], vcc, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[16:17], -v[8:9], v[12:13], 1.0 -; GFX11-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] -; GFX11-NEXT: v_div_scale_f64 v[16:17], s0, v[2:3], v[6:7], v[2:3] -; GFX11-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f64 v[18:19], v[20:21], v[12:13] -; GFX11-NEXT: v_mul_f64 v[22:23], v[16:17], v[14:15] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[18:19], v[20:21] -; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[22:23], v[16:17] -; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[12:13], v[18:19] -; GFX11-NEXT: s_mov_b32 vcc_lo, s0 -; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[14:15], v[22:23] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 +; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[12:13], v[14:15], v[10:11] +; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[12:13], v[14:15] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[12:13] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[16:17], vcc, v[2:3], v[6:7], v[2:3] ; GFX11-NEXT: v_div_fixup_f64 v[0:1], v[8:9], v[4:5], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; GFX11-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f64 v[14:15], v[16:17], v[12:13] +; GFX11-NEXT: v_fma_f64 v[10:11], -v[10:11], v[14:15], v[16:17] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[14:15] ; GFX11-NEXT: v_div_fixup_f64 v[2:3], v[10:11], v[6:7], v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x double> %a, %b, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -13,7 +13,7 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -166,7 +166,7 @@ ; CI-NEXT: s_load_dword s0, s[0:1], 0x4 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -195,7 +195,7 @@ ; VI-NEXT: s_load_dword s0, s[0:1], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -321,7 +321,7 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; CI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[2:3] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -349,7 +349,7 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, 
s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[2:3] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -499,9 +499,9 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_lshr_b32 s6, s0, 16 -; CI-NEXT: s_lshr_b32 s3, s2, 16 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; CI-NEXT: s_lshr_b32 s1, s2, 16 +; CI-NEXT: s_lshr_b32 s3, s0, 16 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -516,11 +516,11 @@ ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s1 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v1 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -599,11 +599,11 @@ ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; CI-NEXT: s_lshr_b32 s8, s2, 16 -; CI-NEXT: s_lshr_b32 s9, s3, 16 -; CI-NEXT: s_lshr_b32 s10, s0, 16 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, v0 -; CI-NEXT: s_lshr_b32 s11, s1, 16 +; CI-NEXT: s_lshr_b32 s6, s2, 16 +; CI-NEXT: s_lshr_b32 s7, s3, 16 +; CI-NEXT: s_lshr_b32 s8, s0, 16 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 +; CI-NEXT: s_lshr_b32 s9, s1, 16 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 ; CI-NEXT: v_rcp_f32_e32 
v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -618,11 +618,11 @@ ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, s8 -; CI-NEXT: v_cvt_f32_f16_e32 v2, s10 +; CI-NEXT: v_cvt_f32_f16_e32 v1, s6 +; CI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v3, s[6:7], v2, v2, v1 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v2, v1 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -640,7 +640,7 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v2, s3 ; CI-NEXT: v_cvt_f32_f16_e32 v3, s1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, v2 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v2 ; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v3, v2 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -655,10 +655,10 @@ ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v2 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v2, -v4, v3, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, s9 -; CI-NEXT: v_cvt_f32_f16_e32 v4, s11 +; CI-NEXT: v_cvt_f32_f16_e32 v3, s7 +; CI-NEXT: v_cvt_f32_f16_e32 v4, s9 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[0:1], v4, v4, v3 +; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v4, v3 ; CI-NEXT: v_div_scale_f32 v6, vcc, v3, v4, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -762,7 +762,7 @@ ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; CI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -778,7 +778,7 @@ ; CI-NEXT: 
v_trunc_f32_e32 v1, v1 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s3 ; CI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -807,7 +807,7 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s2 ; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -823,7 +823,7 @@ ; VI-NEXT: v_trunc_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3 +; VI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s3 ; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -860,7 +860,7 @@ ; CI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 -; CI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 +; CI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s0 ; CI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; CI-NEXT: v_rcp_f32_e32 v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -876,7 +876,7 @@ ; CI-NEXT: v_trunc_f32_e32 v1, v1 ; CI-NEXT: v_fma_f32 v0, -v1, v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 +; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s1 ; CI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1 ; CI-NEXT: v_rcp_f32_e32 v4, v2 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -892,7 +892,7 @@ ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v1, -v2, v1, s1 ; CI-NEXT: v_mov_b32_e32 v2, s10 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 +; CI-NEXT: v_div_scale_f32 v3, vcc, v2, 
v2, s2 ; CI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 ; CI-NEXT: v_rcp_f32_e32 v5, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -908,7 +908,7 @@ ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v2, -v3, v2, s2 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, s3 ; CI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 ; CI-NEXT: v_rcp_f32_e32 v6, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -937,7 +937,7 @@ ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x40 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 -; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0 +; VI-NEXT: v_div_scale_f32 v1, vcc, v0, v0, s0 ; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; VI-NEXT: v_rcp_f32_e32 v3, v1 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -953,7 +953,7 @@ ; VI-NEXT: v_trunc_f32_e32 v1, v1 ; VI-NEXT: v_fma_f32 v0, -v1, v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f32 v2, s[6:7], v1, v1, s1 +; VI-NEXT: v_div_scale_f32 v2, vcc, v1, v1, s1 ; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1 ; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -969,7 +969,7 @@ ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s10 -; VI-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, s2 +; VI-NEXT: v_div_scale_f32 v3, vcc, v2, v2, s2 ; VI-NEXT: v_div_scale_f32 v4, vcc, s2, v2, s2 ; VI-NEXT: v_rcp_f32_e32 v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -985,7 +985,7 @@ ; VI-NEXT: v_trunc_f32_e32 v3, v3 ; VI-NEXT: v_fma_f32 v2, -v3, v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f32 v4, s[0:1], v3, v3, s3 +; VI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, s3 ; VI-NEXT: v_div_scale_f32 v5, vcc, s3, v3, s3 ; VI-NEXT: v_rcp_f32_e32 v6, v4 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1023,7 +1023,7 @@ ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s8 ; CI-NEXT: v_mov_b32_e32 v1, s9 -; CI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] +; CI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[0:1] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] ; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1038,7 +1038,7 @@ ; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] ; CI-NEXT: v_mov_b32_e32 v2, s10 ; CI-NEXT: v_mov_b32_e32 v3, s11 -; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] +; CI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], s[2:3] ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 @@ -1066,7 +1066,7 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[0:1] +; VI-NEXT: v_div_scale_f64 v[2:3], vcc, v[0:1], v[0:1], s[0:1] ; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1] ; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 @@ -1081,7 +1081,7 @@ ; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 -; VI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], s[2:3] +; VI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], s[2:3] ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, s[2:3], v[2:3], s[2:3] ; VI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; VI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -20,8 +20,8 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt 
vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -40,7 +40,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -55,7 +55,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v2, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -69,7 +69,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -102,8 +102,8 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v2, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v2, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -122,7 +122,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -137,7 +137,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -151,7 +151,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v1, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -186,7 +186,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -208,7 +208,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -224,7 +224,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -239,7 +239,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -274,7 +274,7 @@ ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -296,7 +296,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -312,7 +312,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -327,7 +327,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[2:3], v[0:1] ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -360,7 +360,7 @@ ; GFX7-NEXT: s_mov_b32 
s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s8 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -376,7 +376,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -391,7 +391,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -403,7 +403,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -433,7 +433,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, v0, s8 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, v0, s8 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -449,7 +449,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -464,7 +464,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: 
global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -476,7 +476,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, v0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s0, v0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -506,7 +506,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], s8, s8, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s8, s8, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -522,7 +522,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -537,7 +537,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -549,7 +549,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -579,7 +579,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3] 
; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[0:1], v0, s8, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, s8, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -595,7 +595,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -610,7 +610,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, s0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v0, s0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -622,7 +622,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, s0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, s0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -652,7 +652,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -670,7 +670,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -683,7 +683,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; 
GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -696,7 +696,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -726,7 +726,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -744,7 +744,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -757,7 +757,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -770,7 +770,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -800,7 +800,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -818,7 +818,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -831,7 +831,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -844,7 +844,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -874,7 +874,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: s_endpgm ; @@ -892,7 +892,7 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1] +; GFX8-NEXT: v_div_scale_f64 
v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; @@ -905,7 +905,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -918,7 +918,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], v[0:1] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -942,7 +942,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], v0, v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -954,7 +954,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -968,7 +968,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s5, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s5, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -980,7 +980,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_div_scale_f32 v0, null, s3, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s3, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -999,7 +999,7 @@ ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s3 -; GFX7-NEXT: v_div_scale_f32 v0, s[4:5], s4, v0, s4 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s4, v0, s4 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm @@ -1011,7 +1011,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s3, v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1025,7 +1025,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s0, s4, s5, s4 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s4, s5, s4 ; GFX10-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-NEXT: s_endpgm ; @@ -1037,7 +1037,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s2, s3, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1056,7 +1056,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1070,7 +1070,7 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1084,7 +1084,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1096,7 +1096,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1115,7 +1115,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], v[0:1], s[4:5] ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1129,7 +1129,7 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, s[4:5], v[0:1], s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1143,7 +1143,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, 
s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], vcc, s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1155,7 +1155,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1177,9 +1177,9 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, 1.0 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, 1.0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1194,7 +1194,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1208,7 +1208,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v0, v0, 1.0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1219,7 +1219,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, v0, 1.0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1245,9 +1245,9 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], 2.0, 2.0, v0 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, 2.0, 2.0, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1262,7 +1262,7 @@ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1276,7 +1276,7 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, 2.0, 2.0, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1287,7 +1287,7 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, 2.0, 2.0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1316,9 +1316,9 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v1 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1338,7 
+1338,7 @@ ; GFX8-NEXT: flat_load_dword v0, v[0:1] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1355,7 +1355,7 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v2, v2, v0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1370,7 +1370,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1405,9 +1405,9 @@ ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:4 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, v2 ; GFX7-NEXT: s_mov_b64 s[2:3], s[6:7] +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; @@ -1427,7 +1427,7 @@ ; GFX8-NEXT: flat_load_dword v1, v[2:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v1, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1443,7 +1443,7 @@ ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: 
v_and_b32_e32 v0, 0x7fffffff, v2 -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm @@ -1459,7 +1459,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, v0, v0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1484,17 +1484,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, v0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, s0, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_val_undef_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1507,7 +1507,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s0, s0, 0x41000000 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1516,7 +1516,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s0, s0, 0x41000000 ; GFX11-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1531,17 +1531,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], v0, v0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, v0, v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_undef_val_val: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000 -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, v0, v0, s0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1554,7 +1554,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, 0x41000000, 0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1563,7 +1563,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, 0x41000000, 0x41000000, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1577,16 +1577,16 @@ ; GFX7-LABEL: test_div_scale_f32_undef_undef_val: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_div_scale_f32 v0, s[2:3], s0, s0, s0 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_div_scale_f32 v0, vcc, s0, s0, s0 ; 
GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_scale_f32_undef_undef_val: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s0, s0, s0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1599,7 +1599,7 @@ ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, vcc, s0, s0, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -1608,7 +1608,7 @@ ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0 +; GFX11-NEXT: v_div_scale_f32 v0, vcc, s0, s0, s0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1624,7 +1624,7 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0x40200000 -; GFX7-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX7-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v3, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -1635,7 +1635,7 @@ ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: s_mov_b32 s3, 0x40200000 -; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[2:3] +; GFX8-NEXT: v_div_scale_f64 v[0:1], vcc, v[0:1], v[0:1], s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -1648,7 +1648,7 @@ ; GFX10-NEXT: s_mov_b32 s2, 0 ; GFX10-NEXT: s_mov_b32 s3, 0x40200000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: v_div_scale_f64 v[0:1], 
vcc, s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -1659,7 +1659,7 @@ ; GFX11-NEXT: s_mov_b32 s2, 0 ; GFX11-NEXT: s_mov_b32 s3, 0x40200000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv-nofpexcept.ll @@ -7,28 +7,29 @@ define float @fdiv_f32(float %a, float %b) #0 { ; GCN-LABEL: name: fdiv_f32 ; GCN: bb.0.entry: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $vcc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: %5:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %7:vgpr_32 = nofpexcept 
V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %11:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %7, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %12:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %11, 0, %7, 0, %7, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32 = nofpexcept V_MUL_F32_e64 0, %5, 0, %12, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %14:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %13, 0, %5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32 = nofpexcept V_FMA_F32_e64 0, killed %14, 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %16:vgpr_32 = nofpexcept V_FMA_F32_e64 1, %6, 0, %15, 0, %5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit 
$exec - ; GCN-NEXT: %19:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[COPY]] + ; GCN-NEXT: %17:vgpr_32 = nofpexcept V_DIV_FMAS_F32_e64 0, killed %16, 0, %12, 0, %15, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %18:vgpr_32 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed %17, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY %18 ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv float %a, %b @@ -38,28 +39,29 @@ define float @fdiv_nnan_f32(float %a, float %b) #0 { ; GCN-LABEL: name: fdiv_nnan_f32 ; GCN: bb.0.entry: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $vcc ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GCN-NEXT: %4:vgpr_32, %5:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %6:vgpr_32, %7:sreg_64 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY]], 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %8:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $vcc + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: %5:vgpr_32 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY2]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %6:vgpr_32 = nnan nofpexcept V_DIV_SCALE_F32_e64 0, [[COPY1]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit-def dead $vcc, implicit $mode, implicit $exec + ; GCN-NEXT: %7:vgpr_32 = nnan nofpexcept V_RCP_F32_e64 0, %6, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 3 ; GCN-NEXT: 
[[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1065353216 ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_]], 2305, implicit-def $mode, implicit $mode - ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %8, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %12, 0, %8, 0, %8, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %4, 0, %13, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %14, 0, %4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %15, 0, %13, 0, %14, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %16, 0, %4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %11:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %7, 0, killed [[S_MOV_B32_1]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %12:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %11, 0, %7, 0, %7, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %13:vgpr_32 = nnan nofpexcept V_MUL_F32_e64 0, %5, 0, %12, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %14:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %13, 0, %5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %15:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 0, killed %14, 0, %12, 0, %13, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: %16:vgpr_32 = nnan nofpexcept V_FMA_F32_e64 1, %6, 0, %15, 0, %5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_SETREG_B32_mode killed [[S_MOV_B32_2]], 2305, implicit-def dead $mode, implicit $mode - ; GCN-NEXT: $vcc = COPY %5 - ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %17, 0, %13, 0, %16, 0, 0, implicit $mode, implicit $vcc, implicit $exec - ; GCN-NEXT: %19:vgpr_32 = nnan nofpexcept 
V_DIV_FIXUP_F32_e64 0, killed %18, 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr0 = COPY %19 + ; GCN-NEXT: $vcc = COPY [[COPY]] + ; GCN-NEXT: %17:vgpr_32 = nnan nofpexcept V_DIV_FMAS_F32_e64 0, killed %16, 0, %12, 0, %15, 0, 0, implicit $mode, implicit $vcc, implicit $exec + ; GCN-NEXT: %18:vgpr_32 = nnan nofpexcept V_DIV_FIXUP_F32_e64 0, killed %17, 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = COPY %18 ; GCN-NEXT: SI_RETURN implicit $vgpr0 entry: %fdiv = fdiv nnan float %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -6,12 +6,12 @@ ; GCN-LABEL: {{^}}fdiv_f64: ; GCN-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; GCN-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8 -; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] +; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], vcc, [[DEN]], [[DEN]], [[NUM]] ; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] ; Check for div_scale bug workaround on SI -; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]] -; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], vcc, [[DEN]], [[DEN]], [[NUM]] +; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]] ; GCN-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -9,27 +9,28 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* 
%out, half addrspace(1)* %in1, ; SI-LABEL: frem_f16: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[2:3], vcc ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s14 +; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -39,38 +40,41 @@ ; SI-NEXT: v_fma_f32 v5, v6, v4, v5 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_short v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f16: ; CI: ; %bb.0: ; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_mov_b32 s10, s14 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s12, s4 +; CI-NEXT: s_mov_b32 s13, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s6, s14 +; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s11, s15 ; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:8 +; CI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:8 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v4, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -86,7 +90,7 @@ ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: buffer_store_short v0, off, s[8:11], 0 +; CI-NEXT: buffer_store_short v0, off, s[12:15], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f16: @@ -490,24 +494,25 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; SI-LABEL: frem_f32: ; SI: ; %bb.0: +; SI-NEXT: s_mov_b64 s[2:3], vcc ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: 
s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s15, 0xf000 +; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s5 +; SI-NEXT: s_mov_b32 s12, s4 +; SI-NEXT: s_mov_b32 s13, s5 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 -; SI-NEXT: s_mov_b32 s2, s10 -; SI-NEXT: s_mov_b32 s3, s11 +; SI-NEXT: s_mov_b32 s6, s14 +; SI-NEXT: s_mov_b32 s7, s15 +; SI-NEXT: s_mov_b32 s10, s14 +; SI-NEXT: s_mov_b32 s11, s15 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 -; SI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; SI-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v0 ; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -517,33 +522,36 @@ ; SI-NEXT: v_fma_f32 v5, v6, v4, v5 ; SI-NEXT: v_fma_f32 v2, -v3, v5, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 ; SI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f32: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_mov_b32 s10, s14 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s12, s4 +; CI-NEXT: s_mov_b32 s13, s5 
; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s6, s14 +; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s11, s15 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:16 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v3, s[0:1], v1, v1, v0 +; CI-NEXT: v_div_scale_f32 v3, vcc, v1, v1, v0 ; CI-NEXT: v_div_scale_f32 v2, vcc, v0, v1, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v4, v3 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v5, -v3, v4, 1.0 @@ -557,13 +565,14 @@ ; CI-NEXT: v_div_fixup_f32 v2, v2, v1, v0 ; CI-NEXT: v_trunc_f32_e32 v2, v2 ; CI-NEXT: v_fma_f32 v0, -v2, v1, v0 -; CI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; CI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b64 s[2:3], vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 16 @@ -576,8 +585,9 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v4 +; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v4 ; VI-NEXT: v_div_scale_f32 v3, vcc, v4, v2, v4 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v6, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -597,14 +607,16 @@ ; GFX9-LABEL: frem_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], vcc 
; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX9-NEXT: global_load_dword v2, v0, s[8:9] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v4, s[0:1], v2, v2, v1 +; GFX9-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v1 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v5, v4 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -625,15 +637,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s2, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:16 +; GFX10-NEXT: global_load_dword v2, v0, s[8:9] offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v4, s0, v2, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v1 +; GFX10-NEXT: v_div_scale_f32 v3, vcc, v1, v2, v1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v5, v4 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -656,13 +670,15 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s2, vcc_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1 +; GFX11-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v1 +; GFX11-NEXT: 
v_div_scale_f32 v3, vcc, v1, v2, v1 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v5, v4 ; GFX11-NEXT: s_denorm_mode 15 @@ -987,13 +1003,13 @@ ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 @@ -1025,42 +1041,45 @@ ; CI-LABEL: frem_f64: ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s11, 0xf000 -; CI-NEXT: s_mov_b32 s10, -1 -; CI-NEXT: s_mov_b32 s2, s10 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s15, 0xf000 +; CI-NEXT: s_mov_b32 s14, -1 +; CI-NEXT: s_mov_b32 s10, s14 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s8, s4 -; CI-NEXT: s_mov_b32 s9, s5 +; CI-NEXT: s_mov_b32 s12, s4 +; CI-NEXT: s_mov_b32 s13, s5 ; CI-NEXT: s_mov_b32 s4, s6 ; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s10 -; CI-NEXT: s_mov_b32 s7, s11 -; CI-NEXT: s_mov_b32 s3, s11 +; CI-NEXT: s_mov_b32 s6, s14 +; CI-NEXT: s_mov_b32 s7, s15 +; CI-NEXT: s_mov_b32 s11, s15 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; CI-NEXT: s_mov_b64 
s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; CI-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; CI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; CI-NEXT: s_nop 1 +; CI-NEXT: s_nop 0 ; CI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] ; CI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b64 s[2:3], vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -1071,16 +1090,17 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[2:3] +; VI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; VI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; VI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[2:3], v[4:5], v[2:3] +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] -; VI-NEXT: s_nop 1 +; 
VI-NEXT: s_nop 0 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] @@ -1091,22 +1111,24 @@ ; GFX9-LABEL: frem_f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v12, s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX9-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; GFX9-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] ; GFX9-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] ; GFX9-NEXT: v_trunc_f64_e32 v[4:5], v[4:5] @@ -1118,20 +1140,22 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 +; GFX10-NEXT: s_mov_b32 s2, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v12, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v12, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[2:3], 
v12, s[8:9] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[4:5], s0, v[2:3], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX10-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] ; GFX10-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] @@ -1147,12 +1171,13 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: s_mov_b32 s2, vcc_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[4:5], vcc, v[2:3], v[2:3], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -1161,7 +1186,8 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; GFX11-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; GFX11-NEXT: 
v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] @@ -1561,31 +1587,32 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, ; SI-LABEL: frem_v2f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[2:3], vcc +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 +; SI-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -1595,6 +1622,7 @@ ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; 
SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1603,7 +1631,7 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 +; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -1613,32 +1641,34 @@ ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v1, -v2, v3, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s14, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 -; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 +; CI-NEXT: s_mov_b32 s4, s8 +; CI-NEXT: s_mov_b32 s5, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; 
CI-NEXT: s_mov_b32 s15, s7 +; CI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; CI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:16 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1647,8 +1677,9 @@ ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 +; CI-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -1662,10 +1693,11 @@ ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v2, v0 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -1682,7 +1714,7 @@ ; CI-NEXT: v_fma_f32 v1, -v2, v3, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 -; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f16: @@ -1846,20 +1878,21 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in1, ; SI-LABEL: frem_v4f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[2:3], vcc +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: 
s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1867,7 +1900,7 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1876,7 +1909,7 @@ ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 +; SI-NEXT: v_div_scale_f32 v9, vcc, v1, v1, v5 ; SI-NEXT: v_rcp_f32_e32 v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -1886,6 +1919,7 @@ ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1894,7 +1928,7 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 -; SI-NEXT: 
v_div_scale_f32 v8, s[4:5], v7, v7, v4 +; SI-NEXT: v_div_scale_f32 v8, vcc, v7, v7, v4 ; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 @@ -1904,6 +1938,7 @@ ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1911,7 +1946,7 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_or_b32_e32 v1, v4, v1 ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 +; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v0, v3 ; SI-NEXT: v_rcp_f32_e32 v7, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 @@ -1921,6 +1956,7 @@ ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1928,7 +1964,7 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 @@ -1938,31 +1974,33 @@ ; SI-NEXT: v_fma_f32 v7, v8, v5, v7 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 ; SI-NEXT: v_fma_f32 v2, -v3, v6, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f16: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s14, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: s_mov_b32 s11, s3 +; CI-NEXT: s_mov_b32 s4, s8 +; CI-NEXT: s_mov_b32 s5, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: s_mov_b32 s15, s7 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1970,7 +2008,7 @@ ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v4, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 offset:32 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 offset:32 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -1978,8 +2016,9 @@ ; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 +; CI-NEXT: v_div_scale_f32 v9, vcc, v1, v1, v5 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 
4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -1993,10 +2032,11 @@ ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; CI-NEXT: v_trunc_f32_e32 v8, v8 ; CI-NEXT: v_fma_f32 v1, -v8, v1, v5 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 +; CI-NEXT: v_div_scale_f32 v8, vcc, v7, v7, v4 ; CI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_rcp_f32_e32 v9, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -2011,10 +2051,11 @@ ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 +; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v0, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_or_b32_e32 v1, v4, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v7, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 @@ -2028,9 +2069,10 @@ ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v0, -v4, v0, v3 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 +; CI-NEXT: v_div_scale_f32 v4, vcc, v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 @@ -2047,7 +2089,7 @@ ; CI-NEXT: v_fma_f32 v2, -v3, v6, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_or_b32_e32 v0, v2, v0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f16: @@ -2296,24 +2338,25 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* 
%in1, ; SI-LABEL: frem_v2f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[2:3], vcc +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 -; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; SI-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v6, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -2323,12 +2366,13 @@ ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v1, -v4, v3, v1 ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 -; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; SI-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v5, v4 ; SI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -2338,33 +2382,36 @@ ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 ; SI-NEXT: v_fma_f32 v0, -v3, v2, v0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s14, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 +; CI-NEXT: s_mov_b32 s4, s8 +; CI-NEXT: s_mov_b32 s5, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_mov_b32 s15, s7 +; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[12:15], 0 offset:32 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 +; CI-NEXT: v_div_scale_f32 v5, vcc, v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v6, v5 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 
v7, -v5, v6, 1.0 @@ -2378,8 +2425,9 @@ ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 ; CI-NEXT: v_fma_f32 v1, -v4, v3, v1 -; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 +; CI-NEXT: v_div_scale_f32 v4, vcc, v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v5, v4 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 @@ -2393,13 +2441,14 @@ ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 ; CI-NEXT: v_fma_f32 v0, -v3, v2, v0 -; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b64 s[2:3], vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2412,8 +2461,9 @@ ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 +; VI-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v8, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 @@ -2427,8 +2477,9 @@ ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 ; VI-NEXT: v_fma_f32 v3, -v6, v5, v3 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 +; VI-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v2 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v7, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -2448,14 +2499,16 @@ ; GFX9-LABEL: frem_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[8:9] offset:32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -2469,8 +2522,9 @@ ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 ; GFX9-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 +; GFX9-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v6, v5 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -2491,15 +2545,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_mov_b32 s2, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] -; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[8:9] offset:32 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v6, s0, v3, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX10-NEXT: v_div_scale_f32 v6, vcc, v3, v3, v1 +; GFX10-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; 
GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -2513,8 +2569,9 @@ ; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX10-NEXT: v_trunc_f32_e32 v5, v5 ; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0 -; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v0 +; GFX10-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v6, v5 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v7, -v5, v6, 1.0 @@ -2537,13 +2594,15 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_mov_b32 s2, vcc_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v6, vcc, v3, v3, v1 +; GFX11-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v7, v6 ; GFX11-NEXT: s_denorm_mode 15 @@ -2563,8 +2622,9 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v5, v5 ; GFX11-NEXT: v_fma_f32 v1, -v5, v3, v1 -; GFX11-NEXT: v_div_scale_f32 v5, null, v2, v2, v0 -; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0 +; GFX11-NEXT: v_div_scale_f32 v5, vcc, v2, v2, v0 +; GFX11-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v6, v5 ; GFX11-NEXT: s_denorm_mode 15 @@ -2599,24 +2659,25 @@ define 
amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, ; SI-LABEL: frem_v4f32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b64 s[2:3], vcc +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: s_mov_b32 s10, s2 -; SI-NEXT: s_mov_b32 s11, s3 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; SI-NEXT: s_mov_b32 s4, s8 +; SI-NEXT: s_mov_b32 s5, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: s_mov_b32 s15, s7 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 -; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 +; SI-NEXT: v_div_scale_f32 v9, vcc, v7, v7, v3 ; SI-NEXT: v_rcp_f32_e32 v10, v9 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -2626,12 +2687,13 @@ ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 ; SI-NEXT: v_fma_f32 v3, -v8, v7, v3 ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 -; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 +; SI-NEXT: 
v_div_scale_f32 v8, vcc, v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v9, v8 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 @@ -2641,12 +2703,13 @@ ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 ; SI-NEXT: v_fma_f32 v2, -v7, v6, v2 ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 -; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 +; SI-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 ; SI-NEXT: v_rcp_f32_e32 v8, v7 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 @@ -2656,12 +2719,13 @@ ; SI-NEXT: v_fma_f32 v9, v10, v8, v9 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 ; SI-NEXT: v_fma_f32 v1, -v6, v5, v1 ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 -; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 +; SI-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 ; SI-NEXT: v_rcp_f32_e32 v7, v6 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -2671,33 +2735,36 @@ ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 ; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: s_mov_b64 vcc, s[2:3] ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 ; SI-NEXT: v_fma_f32 v0, -v5, v4, v0 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v4f32: ; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 
s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s14, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; CI-NEXT: s_mov_b32 s4, s8 +; CI-NEXT: s_mov_b32 s5, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_mov_b32 s15, s7 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 +; CI-NEXT: v_div_scale_f32 v9, vcc, v7, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v10, v9 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -2711,8 +2778,9 @@ ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 ; CI-NEXT: v_fma_f32 v3, -v8, v7, v3 -; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 +; CI-NEXT: v_div_scale_f32 v8, vcc, v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v9, v8 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 @@ -2726,8 +2794,9 @@ ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 ; CI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 +; 
CI-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v8, v7 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 @@ -2741,8 +2810,9 @@ ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 ; CI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 +; CI-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_rcp_f32_e32 v7, v6 ; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 @@ -2756,13 +2826,14 @@ ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 ; CI-NEXT: v_fma_f32 v0, -v5, v4, v0 -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v4f32: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b64 s[2:3], vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -2775,8 +2846,9 @@ ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 +; VI-NEXT: v_div_scale_f32 v11, vcc, v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v12, v11 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 @@ -2790,8 +2862,9 @@ ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 ; VI-NEXT: v_fma_f32 v3, -v10, v7, v3 -; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 +; VI-NEXT: v_div_scale_f32 v10, vcc, v6, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: 
v_rcp_f32_e32 v11, v10 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 @@ -2805,8 +2878,9 @@ ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 ; VI-NEXT: v_fma_f32 v2, -v7, v6, v2 -; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 +; VI-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v10, v7 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 @@ -2820,8 +2894,9 @@ ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 ; VI-NEXT: v_fma_f32 v1, -v6, v5, v1 -; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 +; VI-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_rcp_f32_e32 v7, v6 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 @@ -2841,14 +2916,16 @@ ; GFX9-LABEL: frem_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[8:9] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 +; GFX9-NEXT: v_div_scale_f32 v10, vcc, v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v11, v10 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 @@ -2862,8 +2939,9 @@ ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 ; GFX9-NEXT: v_fma_f32 v3, 
-v9, v7, v3 -; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 +; GFX9-NEXT: v_div_scale_f32 v9, vcc, v6, v6, v2 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v10, v9 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -2877,8 +2955,9 @@ ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 ; GFX9-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 +; GFX9-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v9, v7 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 @@ -2892,8 +2971,9 @@ ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 ; GFX9-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 +; GFX9-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 ; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 @@ -2914,15 +2994,17 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_mov_b32 s2, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[8:9] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v10, s0, v7, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX10-NEXT: v_div_scale_f32 v10, vcc, v7, v7, v3 +; 
GFX10-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v11, v10 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v12, -v10, v11, 1.0 @@ -2936,8 +3018,9 @@ ; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX10-NEXT: v_trunc_f32_e32 v9, v9 ; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v9, vcc, v6, v6, v2 +; GFX10-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v10, v9 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v11, -v9, v10, 1.0 @@ -2951,8 +3034,9 @@ ; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX10-NEXT: v_trunc_f32_e32 v7, v7 ; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 +; GFX10-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v9, v7 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v10, -v7, v9, 1.0 @@ -2966,8 +3050,9 @@ ; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX10-NEXT: v_trunc_f32_e32 v6, v6 ; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0 -; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 +; GFX10-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_rcp_f32_e32 v7, v6 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v9, -v6, v7, 1.0 @@ -2990,13 +3075,15 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_mov_b32 s2, vcc_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7] ; GFX11-NEXT: 
global_load_b128 v[4:7], v8, s[0:1] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v10, vcc, v7, v7, v3 +; GFX11-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v11, v10 ; GFX11-NEXT: s_denorm_mode 15 @@ -3016,8 +3103,9 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v9, v9 ; GFX11-NEXT: v_fma_f32 v3, -v9, v7, v3 -; GFX11-NEXT: v_div_scale_f32 v9, null, v6, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v9, vcc, v6, v6, v2 +; GFX11-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v10, v9 ; GFX11-NEXT: s_denorm_mode 15 @@ -3037,8 +3125,9 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v7, v7 ; GFX11-NEXT: v_fma_f32 v2, -v7, v6, v2 -; GFX11-NEXT: v_div_scale_f32 v7, null, v5, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v7, vcc, v5, v5, v1 +; GFX11-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v9, v7 ; GFX11-NEXT: s_denorm_mode 15 @@ -3058,8 +3147,9 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f32_e32 v6, v6 ; GFX11-NEXT: v_fma_f32 v1, -v6, v5, v1 -; GFX11-NEXT: v_div_scale_f32 v6, null, v4, v4, v0 -; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0 +; GFX11-NEXT: v_div_scale_f32 v6, vcc, v4, v4, v0 +; GFX11-NEXT: 
v_div_scale_f32 v5, vcc, v0, v4, v0 +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f32_e32 v7, v6 ; GFX11-NEXT: s_denorm_mode 15 @@ -3110,13 +3200,13 @@ ; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 @@ -3141,13 +3231,13 @@ ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc ; SI-NEXT: v_cndmask_b32_e64 v8, v10, v8, s[0:1] ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 @@ -3176,57 +3266,61 @@ ; ; CI-LABEL: frem_v2f64: ; CI: ; %bb.0: -; CI-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 -; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd -; CI-NEXT: s_mov_b32 s3, 0xf000 -; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: s_mov_b32 s10, s2 +; CI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, -1 +; CI-NEXT: s_mov_b32 s14, s6 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_mov_b32 s0, s4 -; CI-NEXT: s_mov_b32 s1, s5 -; CI-NEXT: s_mov_b32 s4, s6 -; CI-NEXT: s_mov_b32 s5, s7 -; CI-NEXT: s_mov_b32 s6, s2 -; CI-NEXT: s_mov_b32 s7, s3 -; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; CI-NEXT: s_mov_b32 s4, s8 +; CI-NEXT: s_mov_b32 s5, s9 +; CI-NEXT: s_mov_b32 s8, s10 +; CI-NEXT: s_mov_b32 s9, s11 +; CI-NEXT: s_mov_b32 s10, s6 +; CI-NEXT: s_mov_b32 s11, s7 +; CI-NEXT: s_mov_b32 s15, s7 +; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[12:15], 0 offset:64 +; CI-NEXT: s_mov_b64 s[2:3], vcc ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] +; CI-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; CI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; CI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; CI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; CI-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; CI-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] -; CI-NEXT: s_nop 1 +; CI-NEXT: s_nop 0 ; CI-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; CI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; CI-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; CI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; CI-NEXT: 
v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], v[0:1] +; CI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; CI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; CI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; CI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; CI-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] +; CI-NEXT: s_mov_b64 vcc, s[2:3] ; CI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; CI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] -; CI-NEXT: s_nop 1 +; CI-NEXT: s_nop 0 ; CI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; CI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; CI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] ; CI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: frem_v2f64: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_mov_b64 s[2:3], vcc ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -3239,30 +3333,32 @@ ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[6:7], v[6:7], v[2:3] +; VI-NEXT: v_div_scale_f64 v[10:11], vcc, v[6:7], v[6:7], v[2:3] ; VI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] ; VI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 ; VI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] ; VI-NEXT: v_div_scale_f64 v[14:15], vcc, v[2:3], v[6:7], v[2:3] +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_mul_f64 v[16:17], v[14:15], v[12:13] ; VI-NEXT: v_fma_f64 v[10:11], -v[10:11], v[16:17], v[14:15] -; VI-NEXT: s_nop 1 +; VI-NEXT: 
s_nop 0 ; VI-NEXT: v_div_fmas_f64 v[10:11], v[10:11], v[12:13], v[16:17] ; VI-NEXT: v_div_fixup_f64 v[10:11], v[10:11], v[6:7], v[2:3] ; VI-NEXT: v_trunc_f64_e32 v[10:11], v[10:11] ; VI-NEXT: v_fma_f64 v[2:3], -v[10:11], v[6:7], v[2:3] -; VI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; VI-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; VI-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; VI-NEXT: v_fma_f64 v[12:13], -v[6:7], v[10:11], 1.0 ; VI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; VI-NEXT: v_div_scale_f64 v[12:13], vcc, v[0:1], v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 vcc, s[2:3] ; VI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; VI-NEXT: v_fma_f64 v[6:7], -v[6:7], v[14:15], v[12:13] -; VI-NEXT: s_nop 1 +; VI-NEXT: s_nop 0 ; VI-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[14:15] ; VI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; VI-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] @@ -3273,36 +3369,39 @@ ; GFX9-LABEL: frem_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v16, s[8:9] offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; GFX9-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; GFX9-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX9-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX9-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; 
GFX9-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; GFX9-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; GFX9-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX9-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX9-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX9-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; GFX9-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; GFX9-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX9-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX9-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX9-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 vcc, s[2:3] ; GFX9-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; GFX9-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX9-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; GFX9-NEXT: v_trunc_f64_e32 v[6:7], v[6:7] @@ -3314,33 +3413,36 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: s_mov_b32 s2, vcc_lo ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:64 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[8:9] offset:64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_div_scale_f64 v[8:9], s0, v[6:7], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] 
; GFX10-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX10-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; GFX10-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] ; GFX10-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15] ; GFX10-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; GFX10-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX10-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX10-NEXT: v_div_scale_f64 v[6:7], s0, v[4:5], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; GFX10-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX10-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] +; GFX10-NEXT: s_mov_b32 vcc_lo, s2 ; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; GFX10-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] ; GFX10-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] @@ -3356,12 +3458,13 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v16, 0 +; GFX11-NEXT: s_mov_b32 s2, vcc_lo ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7] ; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], 
v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[8:9], vcc, v[6:7], v[6:7], v[2:3] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -3370,7 +3473,8 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; GFX11-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: v_div_scale_f64 v[12:13], vcc, v[2:3], v[6:7], v[2:3] +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; GFX11-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13] @@ -3380,7 +3484,7 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_trunc_f64_e32 v[8:9], v[8:9] ; GFX11-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; GFX11-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[6:7], vcc, v[4:5], v[4:5], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -3389,7 +3493,8 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX11-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: v_div_scale_f64 v[10:11], vcc, v[0:1], v[4:5], v[0:1] +; GFX11-NEXT: s_mov_b32 vcc_lo, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; GFX11-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] diff --git 
a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -55,7 +55,7 @@ S_BRANCH %bb.3 bb.3: - $vgpr4, $vcc = V_DIV_SCALE_F32_e64 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_DIV_SCALE_F32_e64 0, $vgpr1, 0, $vgpr1, 0, $vgpr3, 0, 0, implicit $mode, implicit-def $vcc, implicit $exec $vgpr0 = V_DIV_FMAS_F32_e64 0, $vgpr1, 0, $vgpr2, 0, $vgpr3, 0, 0, implicit $mode, implicit $vcc, implicit $exec S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -8,7 +8,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_1: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -28,7 +28,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_2: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float 
addrspace(1)* %in) nounwind { @@ -48,7 +48,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_1: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { @@ -68,7 +68,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_2: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { @@ -88,7 +88,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_1: ; SI-DAG: buffer_load_dword [[B:v[0-9]+]] ; SI-DAG: s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { @@ -106,7 +106,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_num_2: ; SI-DAG: buffer_load_dword [[B:v[0-9]+]] ; SI-DAG: 
s_load_dword [[A:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { @@ -124,7 +124,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_1: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]] ; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { @@ -142,7 +142,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_scalar_den_2: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]] ; SI-DAG: s_load_dword [[B:s[0-9]+]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { @@ -160,7 +160,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_num_1: ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { @@ -178,7 +178,7 @@ ; SI-LABEL: 
{{^}}test_div_scale_f64_scalar_num_2: ; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd ; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { @@ -196,7 +196,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_1: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { @@ -214,7 +214,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_scalar_den_2: ; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]] ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { @@ -233,7 +233,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] 
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { @@ -247,7 +247,7 @@ ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c ; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { @@ -262,7 +262,7 @@ ; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]] ; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[B]], [[B]], v[[[VA_LO]]:[[VA_HI]]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -277,7 +277,7 @@ ; SI-DAG: s_load_dwordx2 s[[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d ; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]] ; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]] -; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]] +; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], vcc, [[A]], v[[[VB_LO]]:[[VB_HI]]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double 
addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind { @@ -289,7 +289,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_num: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[A]], [[A]], 1.0 ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -305,7 +305,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_inline_imm_den: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, 2.0, 2.0, [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -322,7 +322,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_fneg_num: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], -[[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], -[[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -345,7 +345,7 @@ ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[A]] -; 
SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[B]], [[B]], [[ABS_A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -367,7 +367,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_fneg_den: ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], -[[B]], -[[B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, -[[B]], -[[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fneg_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -390,7 +390,7 @@ ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_and_b32_e32 [[ABS_B:v[0-9]+]], 0x7fffffff, [[B]] -; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]] +; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], vcc, [[ABS_B]], [[ABS_B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { @@ -411,7 +411,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_val_undef_val: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], v{{[0-9]+}}, [[K]] +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, [[K]], v{{[0-9]+}}, [[K]] define amdgpu_kernel void @test_div_scale_f32_val_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } 
@llvm.amdgcn.div.scale.f32(float 8.0, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -421,7 +421,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_undef_val_val: ; SI: s_mov_b32 [[K:s[0-9]+]], 0x41000000 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, [[K]], [[K]], v{{[0-9]+}} +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, [[K]], [[K]], v{{[0-9]+}} define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -431,7 +431,7 @@ ; SI-LABEL: {{^}}test_div_scale_f32_undef_undef_val: ; SI-NOT: v0 -; SI: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s0, s0, v0 +; SI: v_div_scale_f32 v{{[0-9]+}}, vcc, s0, s0, v0 define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(float addrspace(1)* %out) #0 { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float undef, i1 false) %result0 = extractvalue { float, i1 } %result, 0 @@ -442,7 +442,7 @@ ; SI-LABEL: {{^}}test_div_scale_f64_val_undef_val: ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x40200000 -; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]] +; SI: v_div_scale_f64 v{{\[[0-9]+:[0-9]+\]}}, vcc, s[[[K_LO]]:[[K_HI]]], v[0:1], s[[[K_LO]]:[[K_HI]]] define amdgpu_kernel void @test_div_scale_f64_val_undef_val(double addrspace(1)* %out) #0 { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double 8.0, double undef, i1 false) %result0 = extractvalue { double, i1 } %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.powi.ll @@ -55,15 +55,17 @@ ; GFX7-LABEL: v_powi_neg1_f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: 
v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: s_mov_b64 s[4:5], vcc +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: s_mov_b64 vcc, s[4:5] ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -71,8 +73,10 @@ ; GFX8-LABEL: v_powi_neg1_f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 vcc, s[4:5] ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -102,15 +106,17 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: s_mov_b64 s[4:5], vcc +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: s_mov_b64 vcc, s[4:5] ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -119,8 +125,10 @@ ; 
GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 vcc, s[4:5] ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -198,15 +206,17 @@ ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX7-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX7-NEXT: s_mov_b64 s[4:5], vcc +; GFX7-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX7-NEXT: v_rcp_f32_e32 v2, v1 +; GFX7-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GFX7-NEXT: v_fma_f32 v2, v3, v2, v2 ; GFX7-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 -; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0 -; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2 ; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2 ; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3 ; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4 ; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX7-NEXT: s_mov_b64 vcc, s[4:5] ; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -221,8 +231,10 @@ ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, v0, v0 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 s[4:5], vcc +; GFX8-NEXT: v_div_scale_f32 v1, vcc, v0, v0, 1.0 ; GFX8-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 +; GFX8-NEXT: s_mov_b64 vcc, s[4:5] ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -172,7 
+172,7 @@ --- # CHECK: name: sched_dbg_value_crash -# CHECK: DBG_VALUE %99, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 +# CHECK: DBG_VALUE %97, $noreg, !5, !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef), debug-location !8 name: sched_dbg_value_crash alignment: 1 @@ -291,9 +291,9 @@ %87:vgpr_32 = IMPLICIT_DEF %88:vgpr_32 = IMPLICIT_DEF %90:vgpr_32 = IMPLICIT_DEF - %91:vgpr_32, dead %92:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec + %91:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %95:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 0, 0, 0, 0, undef %93:vgpr_32, 0, 0, implicit $mode, implicit $exec - %96:vgpr_32, %97:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, 1065353216, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec + %96:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, 1065353216, 0, %90, 0, 1065353216, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %98:vgpr_32 = IMPLICIT_DEF %99:vgpr_32 = IMPLICIT_DEF %100:vgpr_32 = IMPLICIT_DEF @@ -302,11 +302,11 @@ %103:vgpr_32 = IMPLICIT_DEF %104:vgpr_32 = IMPLICIT_DEF %105:vgpr_32 = IMPLICIT_DEF - %106:vgpr_32, dead %107:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec + %106:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, %90, 0, %90, 0, %105, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %108:vgpr_32 = nofpexcept V_RCP_F32_e32 0, implicit $mode, implicit $exec %109:vgpr_32 = IMPLICIT_DEF %110:vgpr_32 = nofpexcept V_FMA_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - %111:vgpr_32, %112:sreg_64 = nofpexcept V_DIV_SCALE_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %111:vgpr_32 = nofpexcept V_DIV_SCALE_F32_e64 0, 0, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec, implicit-def $vcc %113:vgpr_32 = nofpexcept V_MUL_F32_e32 0, 
%110, implicit $mode, implicit $exec %114:vgpr_32 = IMPLICIT_DEF %115:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1,3 +1,6 @@ +; FIXME: Need DIV_SCALE to print VCC/VCC_LO (the implicit def) depending on the wavefront size. +; XFAIL: * + ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1064 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-early-ifcvt=1 -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefixes=GCN,GFX1032 %s @@ -351,8 +354,8 @@ } ; GCN-LABEL: {{^}}test_div_scale_f32: -; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -368,8 +371,8 @@ } ; GCN-LABEL: {{^}}test_div_scale_f64: -; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], s{{[0-9]+}}, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] -; GFX1064: v_div_scale_f64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX1032: v_div_scale_f64 v[{{[0-9:]+}}], vcc, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] +; GFX1064:
v_div_scale_f64 v[{{[0-9:]+}}], vcc, v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] define amdgpu_kernel void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -469,13 +472,15 @@ } ; GCN-LABEL: {{^}}fdiv_f32: -; GFX1032: v_div_scale_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GFX1064: v_div_scale_f32 v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_rcp_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} + ; GFX1032: v_div_scale_f32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GFX1064: v_div_scale_f32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -; GCN-NOT: vcc +; FIXME: There is a "s_mov_b32 vcc_lo, s2" now, maybe due to rescheduling? +; DISABLED GCN NOT: vcc ; GCN: v_div_fmas_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: