Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -107,6 +107,10 @@ const TargetLowering &getTargetLowering() const; + /// \return true if legalizer info is available (i.e. the combine is not + /// running prior to legalization) and \p Query is legal on the target. + bool isLegal(const LegalityQuery &Query) const; + /// \return true if the combine is running prior to legalization, or if \p /// Query is legal on the target. bool isLegalOrBeforeLegalizer(const LegalityQuery &Query) const; @@ -329,6 +333,15 @@ bool matchCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); bool applyCombineFAbsOfFAbs(MachineInstr &MI, Register &Src); + /// Transform (fadd (fmul x, y), z) -> (fma x, y, z) or (fmad x, y, z); the + /// fmul may be either operand of the fadd. + bool matchCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo); + bool applyCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo); + /// Transform trunc ([asz]ext x) to x or ([asz]ext x) or (trunc x). bool matchCombineTruncOfExt(MachineInstr &MI, std::pair &MatchInfo); @@ -541,6 +554,10 @@ SmallDenseMap &MemOffset2Idx, const SmallVector &RegsToVisit, const unsigned MemSizeInBits); + + /// Checks if \p MI is TargetOpcode::G_FMUL and contractable either + /// due to global flags or MachineInstr flags. + bool isContractableFMul(const MachineInstr &MI, bool AllowFusionGlobally); }; } // namespace llvm Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -792,6 +792,13 @@ return false; } + /// Return true if target always benefits from combining into FMA for a + /// given value type. This must typically return false on targets where FMA + /// takes more cycles to execute than FADD. 
+ virtual bool enableAggressiveFMAFusion(LLT Ty) const { + return false; + } + /// Return the ValueType of the result of SETCC operations. virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const; @@ -2665,6 +2672,14 @@ return isFPExtFree(DestVT, SrcVT); } + /// Return true if an fpext operation input to an \p Opcode operation is free + /// (for instance, because half-precision floating-point numbers are + /// implicitly extended to float-precision) for an FMA instruction. + virtual bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return false; + } + /// Return true if folding a vector load into ExtVal (a sign, zero, or any /// extend node) is profitable. virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } @@ -2696,11 +2711,46 @@ return false; } + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. + /// + /// NOTE: This may be called before legalization on types for which FMAs are + /// not legal, but should return true if those types will eventually legalize + /// to types that support FMAs. After legalization, it will only be called on + /// types that support FMAs (via Legal or Custom actions). + virtual bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT) const { + return false; + } + /// IR version virtual bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *) const { return false; } + /// Returns true if \p MI can be combined with another instruction to form a + /// TargetOpcode::G_FMAD. \p MI may be a G_FADD, a G_FSUB, or a G_FMUL which + /// will be distributed into an fadd/fsub. 
+ virtual bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const { + assert((MI.getOpcode() == TargetOpcode::G_FADD || + MI.getOpcode() == TargetOpcode::G_FSUB || + MI.getOpcode() == TargetOpcode::G_FMUL) && + "unexpected node in FMAD forming combine"); + switch (Ty.getScalarSizeInBits()) { + case 16: + return isOperationLegal(ISD::FMAD, MVT::f16); + case 32: + return isOperationLegal(ISD::FMAD, MVT::f32); + case 64: + return isOperationLegal(ISD::FMAD, MVT::f64); + default: + break; + } + + return false; + } + /// Returns true if be combined with to form an ISD::FMAD. \p N may be an /// ISD::FADD, ISD::FSUB, or an ISD::FMUL which will be distributed into an /// fadd/fsub. Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -561,6 +561,20 @@ [{ return Helper.matchExtendThroughPhis(*${root}, ${matchinfo}); }]), (apply [{ return Helper.applyExtendThroughPhis(*${root}, ${matchinfo}); }])>; +// Transform (fadd x, (fmul y, z)) -> (fma y, z, x) +// (fadd x, (fmul y, z)) -> (fmad y, z, x) +// Transform (fadd (fmul x, y), z) -> (fma x, y, z) +// (fadd (fmul x, y), z) -> (fmad x, y, z) +def combine_fadd_fmul_to_fmad_or_fma_info : + GIDefMatchData<"std::tuple<Register, Register, Register, unsigned>">; +def combine_fadd_fmul_to_fmad_or_fma: GICombineRule< + (defs root:$root, combine_fadd_fmul_to_fmad_or_fma_info:$info), + (match (wip_match_opcode G_FADD):$root, + [{ return Helper.matchCombineFAddFMulToFMadOrFMA(*${root}, + ${info}); }]), + (apply [{ return Helper.applyCombineFAddFMulToFMadOrFMA(*${root}, + ${info}); }])>; + // Currently only the one combine above. 
def insert_vec_elt_combines : GICombineGroup< [combine_insert_vec_elts_build_vector]>; @@ -605,4 +619,5 @@ unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc, unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, const_combines, xor_of_and_with_same_reg, ptr_add_with_zero, - shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>; + shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine, + combine_fadd_fmul_to_fmad_or_fma]>; Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -115,6 +115,11 @@ return !LI || LI->getAction(Query).Action == LegalizeActions::Legal; } +bool CombinerHelper::isLegal( + const LegalityQuery &Query) const { + return LI && LI->getAction(Query).Action == LegalizeActions::Legal; +} + void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, Register ToReg) const { Observer.changingAllUsesOfReg(MRI, FromReg); @@ -3656,6 +3661,112 @@ return true; } +bool CombinerHelper::isContractableFMul(const MachineInstr &MI, + bool AllowFusionGlobally) { + if (MI.getOpcode() != TargetOpcode::G_FMUL) + return false; + return AllowFusionGlobally || + MI.getFlag(MachineInstr::MIFlag::FmReassoc) || + MI.getFlag(MachineInstr::MIFlag::FmContract); +} + +bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_FADD); + + auto *MF = MI.getParent()->getParent(); + const auto &TLI = *MF->getSubtarget().getTargetLowering(); + const TargetOptions &Options = MF->getTarget().Options; + LLT DstType = MRI.getType(MI.getOperand(0).getReg()); + LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); + // Track the FADD's own source registers next to their defining instructions: + // the def of the addend may produce several values (e.g. G_UNMERGE_VALUES), + // so its operand 0 is not necessarily the register the FADD reads. + Register Reg0 = MI.getOperand(1).getReg(); + Register Reg1 = MI.getOperand(2).getReg(); + MachineInstr *MI0 = MRI.getVRegDef(Reg0); + MachineInstr *MI1 = MRI.getVRegDef(Reg1); + + bool LegalOperations = + isLegal({TargetOpcode::G_FADD, 
{DstType, SrcType}}); + // Floating-point multiply-add with intermediate rounding. + bool HasFMAD = (LegalOperations && TLI.isFMADLegal(MI, DstType)); + // Floating-point multiply-add without intermediate rounding. + bool HasFMA = + TLI.isFMAFasterThanFMulAndFAdd(*MF, DstType) && + (!LegalOperations || isLegal({TargetOpcode::G_FMA, {DstType, SrcType}})); + + // No valid opcode, do not combine. + if (!HasFMAD && !HasFMA) + return false; + + bool CanFuse = + Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmContract); + bool AllowFusionGlobally = + (Options.AllowFPOpFusion == FPOpFusion::Fast || CanFuse || HasFMAD); + + // If the addition is not contractable, do not combine. + if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract)) + return false; + + unsigned PreferredFusedOpcode = + HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA; + bool Aggressive = TLI.enableAggressiveFMAFusion(DstType); + + // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), + // prefer to fold the multiply with fewer uses. 
+ if (Aggressive && + isContractableFMul(*MI0, AllowFusionGlobally) && + isContractableFMul(*MI1, AllowFusionGlobally)) { + if (std::distance( + MRI.use_instr_nodbg_begin(MI0->getOperand(0).getReg()), + MRI.use_instr_nodbg_end()) > + std::distance( + MRI.use_instr_nodbg_begin(MI1->getOperand(0).getReg()), + MRI.use_instr_nodbg_end())) { + std::swap(MI0, MI1); + std::swap(Reg0, Reg1); + } + } + + // fold (fadd (fmul x, y), z) -> (fma x, y, z) + if (isContractableFMul(*MI0, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(MI0->getOperand(0).getReg()))) { + MatchInfo = {MI0->getOperand(1).getReg(), + MI0->getOperand(2).getReg(), + Reg1, + PreferredFusedOpcode}; + return true; + } + + // fold (fadd x, (fmul y, z)) -> (fma y, z, x) + if (isContractableFMul(*MI1, AllowFusionGlobally) && + (Aggressive || MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))) { + MatchInfo = {MI1->getOperand(1).getReg(), + MI1->getOperand(2).getReg(), + Reg0, + PreferredFusedOpcode}; + return true; + } + + return false; +} + +bool CombinerHelper::applyCombineFAddFMulToFMadOrFMA( + MachineInstr &MI, + std::tuple<Register, Register, Register, unsigned> &MatchInfo) { + Register Src1, Src2, Src3; + unsigned PreferredFusedOpcode; + std::tie(Src1, Src2, Src3, PreferredFusedOpcode) = MatchInfo; + + Builder.setInstrAndDebugLoc(MI); + Builder.buildInstr(PreferredFusedOpcode, + {MI.getOperand(0).getReg()}, {Src1, Src2, Src3}); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3350,9 +3350,12 @@ InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { // FIXME: Handle op_sel + Register Src; + unsigned Mods; + std::tie(Src, Mods) = 
selectVOP3ModsImpl(Root); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } Index: llvm/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -248,6 +248,8 @@ bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override; + bool isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, LLT DestTy, + LLT SrcTy) const override; bool isShuffleMaskLegal(ArrayRef /*Mask*/, EVT /*VT*/) const override; @@ -373,6 +375,7 @@ MachineBasicBlock *BB) const override; bool hasBitPreservingFPLogic(EVT VT) const override; + bool enableAggressiveFMAFusion(LLT Ty) const override; bool enableAggressiveFMAFusion(EVT VT) const override; EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -381,6 +384,9 @@ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const override; + bool isFMADLegal(const MachineInstr &MI, const LLT Ty) const override; bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override; SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const; Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -872,6 +872,16 @@ !hasFP32Denormals(DAG.getMachineFunction()); } +bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode, + LLT DestTy, LLT SrcTy) const { + return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == 
TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && + DestTy.getScalarSizeInBits() == 32 && + SrcTy.getScalarSizeInBits() == 16 && + // TODO: This probably only requires no input flushing? + !hasFP32Denormals(*MI.getMF()); +} + bool SITargetLowering::isShuffleMaskLegal(ArrayRef, EVT) const { // SI has some legal vector types, but no legal vector operations. Say no // shuffles are legal in order to prefer scalarizing some vector operations. @@ -4309,6 +4319,10 @@ return true; } +bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { + return true; +} + EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const { if (!VT.isVector()) { @@ -4374,6 +4388,41 @@ return false; } +/// LLT overload: forwards to the EVT overload using the MVT whose width equals +/// the scalar size of \p Ty (f16/f32/f64); other sizes report false. +bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + const LLT Ty) const { + switch (Ty.getScalarSizeInBits()) { + case 16: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f16); + case 32: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f32); + case 64: + return isFMAFasterThanFMulAndFAdd(MF, MVT::f64); + default: + break; + } + + return false; +} + +/// LLT overload: reports true only for 16- and 32-bit scalar types, and only +/// if the subtarget has the mad instruction and the denormal query is false. +bool SITargetLowering::isFMADLegal(const MachineInstr &MI, + const LLT Ty) const { + if (!Ty.isScalar()) + return false; + + if (Ty.getScalarSizeInBits() == 16) + return Subtarget->hasMadF16() && + !hasFP64FP16Denormals(*MI.getMF()); + if (Ty.getScalarSizeInBits() == 32) + return Subtarget->hasMadMacF32Insts() && + !hasFP32Denormals(*MI.getMF()); + + return false; +} + bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const { // TODO: Check future ftz flag Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -0,0 +1,1196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: 
llc -global-isel -march=amdgcn -mcpu=gfx900 -fp-contract=fast < %s | FileCheck -check-prefix=GFX9-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX9-DENORM %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < %s | FileCheck -check-prefix=GFX10-CONTRACT %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 --denormal-fp-math=preserve-sign < %s | FileCheck -check-prefix=GFX10-DENORM %s + +; fold (fadd (fmul x, y), z) -> (fma x, y, z) +; fold (fadd x, (fmul y, z)) -> (fma y, z, x) + +define float @test_f32_add_mul(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul float %x, %y + %b = fadd float %a, %z + ret float %b +} + +define float @test_f32_add_mul_rhs(float %x, float %y, float %z) { +; GFX9-LABEL: test_f32_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_f32_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_f32_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_f32_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_f32_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_f32_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v1, v2 +; GFX10-DENORM-NEXT: 
s_setpc_b64 s[30:31] +.entry: + %a = fmul float %x, %y + %b = fadd float %z, %a + ret float %b +} + +define half @test_half_add_mul(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul half %x, %y + %b = fadd half %a, %z + ret half %b +} + +define half @test_half_add_mul_rhs(half %x, half %y, half %z) { +; GFX9-LABEL: test_half_add_mul_rhs: +; GFX9: ; 
%bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX9-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_half_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_half_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_legacy_f16 v0, v0, v1, v2 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_half_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_half_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_half_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v2, v0 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul half %x, %y + %b = fadd half %z, %a + ret half %b +} + +define double @test_double_add_mul(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_double_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GFX10-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %a, %z + ret double %b +} + +define double @test_double_add_mul_rhs(double %x, double %y, double %z) { +; GFX9-LABEL: test_double_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_double_add_mul_rhs: +; 
GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_double_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[4:5], v[0:1] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_double_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_double_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_double_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX10-DENORM-NEXT: v_mov_b32_e32 v6, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v7, v5 +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[6:7], v[0:1] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul double %x, %y + %b = fadd double %z, %a + ret double %b +} + +define <4 x float> @test_4xfloat_add_mul(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; GFX9-LABEL: test_4xfloat_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 +; 
GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xfloat_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xfloat_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8 +; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9 +; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10 +; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xfloat_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v23, v2, v6 +; GFX10-NEXT: v_mul_f32_e32 v15, v0, v4 +; GFX10-NEXT: v_mul_f32_e32 v19, v1, v5 +; GFX10-NEXT: v_mul_f32_e32 v6, v3, v7 +; GFX10-NEXT: v_add_f32_e32 v2, v23, v10 +; GFX10-NEXT: v_add_f32_e32 v0, v15, v8 +; GFX10-NEXT: v_add_f32_e32 v1, v19, v9 +; GFX10-NEXT: v_add_f32_e32 v3, v6, v11 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xfloat_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX10-CONTRACT-NEXT: s_setpc_b64 
s[30:31] +; +; GFX10-DENORM-LABEL: test_4xfloat_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v4, v8 +; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v5, v9 +; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v6, v10 +; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v7, v11 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <4 x float> %x, %y + %b = fadd <4 x float> %a, %z + ret <4 x float> %b +} + +define <5 x float> @test_5xfloat_add_mul_rhs(<5 x float> %x, <5 x float> %y, <5 x float> %z) { +; GFX9-LABEL: test_5xfloat_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v6 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v7 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v8 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v9 +; GFX9-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX9-NEXT: v_add_f32_e32 v1, v11, v1 +; GFX9-NEXT: v_add_f32_e32 v2, v12, v2 +; GFX9-NEXT: v_add_f32_e32 v3, v13, v3 +; GFX9-NEXT: v_add_f32_e32 v4, v14, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_5xfloat_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f32 v0, v0, v5, v10 +; GFX9-CONTRACT-NEXT: v_fma_f32 v1, v1, v6, v11 +; GFX9-CONTRACT-NEXT: v_fma_f32 v2, v2, v7, v12 +; GFX9-CONTRACT-NEXT: v_fma_f32 v3, v3, v8, v13 +; GFX9-CONTRACT-NEXT: v_fma_f32 v4, v4, v9, v14 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_5xfloat_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mad_f32 v0, v0, v5, v10 +; GFX9-DENORM-NEXT: v_mad_f32 v1, v1, v6, v11 +; GFX9-DENORM-NEXT: v_mad_f32 v2, v2, v7, v12 +; GFX9-DENORM-NEXT: v_mad_f32 v3, v3, v8, v13 +; GFX9-DENORM-NEXT: v_mad_f32 v4, v4, 
v9, v14 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_5xfloat_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v5 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v6 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v7 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v8 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v9 +; GFX10-NEXT: v_add_f32_e32 v0, v10, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v11, v1 +; GFX10-NEXT: v_add_f32_e32 v2, v12, v2 +; GFX10-NEXT: v_add_f32_e32 v3, v13, v3 +; GFX10-NEXT: v_add_f32_e32 v4, v14, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_5xfloat_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_fma_f32 v0, v0, v5, v10 +; GFX10-CONTRACT-NEXT: v_fma_f32 v1, v1, v6, v11 +; GFX10-CONTRACT-NEXT: v_fma_f32 v2, v2, v7, v12 +; GFX10-CONTRACT-NEXT: v_fma_f32 v3, v3, v8, v13 +; GFX10-CONTRACT-NEXT: v_fma_f32 v4, v4, v9, v14 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_5xfloat_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mad_f32 v0, v0, v5, v10 +; GFX10-DENORM-NEXT: v_mad_f32 v1, v1, v6, v11 +; GFX10-DENORM-NEXT: v_mad_f32 v2, v2, v7, v12 +; GFX10-DENORM-NEXT: v_mad_f32 v3, v3, v8, v13 +; GFX10-DENORM-NEXT: v_mad_f32 v4, v4, v9, v14 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <5 x float> %x, %y + %b = fadd <5 x float> %z, %a + ret <5 x float> %b +} + +define <4 x half> @test_4xhalf_add_mul(<4 x half> %x, <4 x half> %y, <4 x half> %z) { +; GFX9-LABEL: test_4xhalf_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: 
v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX9-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xhalf_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v5 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xhalf_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_pk_mul_f16 v7, v0, v2 +; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-NEXT: v_pk_add_f16 v0, v7, v4 +; GFX10-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xhalf_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v7, v0, v2 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 +; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v7, v4 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <4 x half> %x, %y + %b = fadd <4 x half> %a, %z + ret <4 x 
half> %b +} + +define <3 x half> @test_4xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x half> %z) { +; GFX9-LABEL: test_4xhalf_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-NEXT: v_and_or_b32 v2, v2, v9, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX9-NEXT: v_and_or_b32 v2, v4, v9, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v4 +; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: v_and_or_b32 v3, v3, v9, s4 +; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_or_b32 v3, v5, v9, s4 +; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_pk_add_f16 v1, v3, v1 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: v_and_or_b32 v0, v0, v9, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xhalf_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 +; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v6 +; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; 
GFX9-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v6 +; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-CONTRACT-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v3, v3, v9, s4 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v5, v5, v9, s4 +; GFX9-CONTRACT-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v2 +; GFX9-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xhalf_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v6 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX9-DENORM-NEXT: v_and_or_b32 v2, v2, v9, v6 +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX9-DENORM-NEXT: v_and_or_b32 v2, v4, v9, v2 +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v4 +; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: v_and_or_b32 v3, v3, v9, s4 +; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX9-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-DENORM-NEXT: v_and_or_b32 v3, v5, v9, s4 +; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v3, v1 +; GFX9-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; 
GFX9-DENORM-NEXT: v_and_or_b32 v0, v0, v9, v2 +; GFX9-DENORM-NEXT: v_and_or_b32 v1, v1, v9, s4 +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xhalf_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX10-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-NEXT: v_and_or_b32 v3, v3, v11, s4 +; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v6 +; GFX10-NEXT: v_and_or_b32 v2, v2, v11, v9 +; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX10-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_and_or_b32 v2, v4, v11, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v6 +; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX10-NEXT: v_and_or_b32 v2, v5, v11, s4 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-NEXT: v_and_or_b32 v0, v0, v11, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xhalf_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v10, 0xffff +; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 
16, v7 +; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v10, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v10, v7 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v11, v0, v10, v11 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v10, v15 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v11, v2, v4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v10, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v10, s4 +; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 +; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v10, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v10, v3 +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xhalf_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v11, 0xffff +; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v11, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v11, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v11, v9 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 +; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v11, v2 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v11, v6 +; 
GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v11, s4 +; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v11, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v11, v6 +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <3 x half> %x, %y + %b = fadd <3 x half> %z, %a + ret <3 x half> %b +} + + +define <4 x double> @test_4xdouble_add_mul(<4 x double> %x, <4 x double> %y, <4 x double> %z) { +; GFX9-LABEL: test_4xdouble_add_mul: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15] +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], v[16:17] +; GFX9-NEXT: v_add_f64 v[2:3], v[2:3], v[18:19] +; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21] +; GFX9-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_4xdouble_add_mul: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[8:9], v[16:17] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[10:11], v[18:19] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[20:21] +; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[22:23] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_4xdouble_add_mul: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[8:9] +; GFX9-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[10:11] +; GFX9-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[12:13] +; GFX9-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[14:15] 
+; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[0:1], v[16:17] +; GFX9-DENORM-NEXT: v_add_f64 v[2:3], v[2:3], v[18:19] +; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[4:5], v[20:21] +; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[6:7], v[22:23] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_4xdouble_add_mul: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v26, v0 +; GFX10-NEXT: v_mov_b32_e32 v27, v1 +; GFX10-NEXT: v_mov_b32_e32 v24, v2 +; GFX10-NEXT: v_mov_b32_e32 v25, v3 +; GFX10-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-NEXT: v_mov_b32_e32 v28, v6 +; GFX10-NEXT: v_mov_b32_e32 v29, v7 +; GFX10-NEXT: v_mul_f64 v[26:27], v[26:27], v[8:9] +; GFX10-NEXT: v_mul_f64 v[8:9], v[24:25], v[10:11] +; GFX10-NEXT: v_mul_f64 v[10:11], v[30:31], v[12:13] +; GFX10-NEXT: v_mul_f64 v[12:13], v[28:29], v[14:15] +; GFX10-NEXT: v_add_f64 v[0:1], v[26:27], v[16:17] +; GFX10-NEXT: v_add_f64 v[2:3], v[8:9], v[18:19] +; GFX10-NEXT: v_add_f64 v[4:5], v[10:11], v[20:21] +; GFX10-NEXT: v_add_f64 v[6:7], v[12:13], v[22:23] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_4xdouble_add_mul: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v26, v0 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v27, v1 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v24, v2 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v25, v3 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v28, v6 +; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v29, v7 +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[26:27], v[8:9], v[16:17] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[24:25], v[10:11], v[18:19] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[30:31], v[12:13], v[20:21] +; GFX10-CONTRACT-NEXT: v_fma_f64 
v[6:7], v[28:29], v[14:15], v[22:23] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_4xdouble_add_mul: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v26, v0 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v27, v1 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v24, v2 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v25, v3 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v30, v4 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v31, v5 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v28, v6 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v29, v7 +; GFX10-DENORM-NEXT: v_mul_f64 v[26:27], v[26:27], v[8:9] +; GFX10-DENORM-NEXT: v_mul_f64 v[8:9], v[24:25], v[10:11] +; GFX10-DENORM-NEXT: v_mul_f64 v[10:11], v[30:31], v[12:13] +; GFX10-DENORM-NEXT: v_mul_f64 v[12:13], v[28:29], v[14:15] +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[26:27], v[16:17] +; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[8:9], v[18:19] +; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[10:11], v[20:21] +; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[12:13], v[22:23] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <4 x double> %x, %y + %b = fadd <4 x double> %a, %z + ret <4 x double> %b +} + +define <10 x double> @test_10xdouble_add_mul_rhs(<10 x double> %x, <10 x double> %y, <10 x double> %z) { +; GFX9-LABEL: test_10xdouble_add_mul_rhs: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], v[20:21] +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], v[22:23] +; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], v[24:25] +; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], v[26:27] +; GFX9-NEXT: v_mul_f64 v[8:9], v[8:9], v[28:29] +; GFX9-NEXT: v_mul_f64 v[10:11], v[10:11], v[30:31] +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v23, off, 
s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_mul_f64 v[12:13], v[12:13], v[20:21] +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_mul_f64 v[14:15], v[14:15], v[22:23] +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_mul_f64 v[16:17], v[16:17], v[24:25] +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_mul_f64 v[18:19], v[18:19], v[26:27] +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_add_f64 v[0:1], v[28:29], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_f64 v[2:3], v[30:31], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_f64 v[4:5], v[20:21], v[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_f64 v[6:7], v[22:23], v[6:7] +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; GFX9-NEXT: 
buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_add_f64 v[8:9], v[24:25], v[8:9] +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_add_f64 v[10:11], v[26:27], v[10:11] +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_add_f64 v[12:13], v[20:21], v[12:13] +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_f64 v[14:15], v[22:23], v[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_f64 v[16:17], v[28:29], v[16:17] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[18:19], v[30:31], v[18:19] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-CONTRACT-LABEL: test_10xdouble_add_mul_rhs: +; GFX9-CONTRACT: ; %bb.0: ; %.entry +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 +; GFX9-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(1) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[20:21], v[34:35] +; GFX9-CONTRACT-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX9-CONTRACT-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GFX9-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[22:23], v[36:37] +; GFX9-CONTRACT-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:48 +; GFX9-CONTRACT-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:52 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:56 +; GFX9-CONTRACT-NEXT: 
s_waitcnt vmcnt(1) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[24:25], v[22:23] +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; GFX9-CONTRACT-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; GFX9-CONTRACT-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:76 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[26:27], v[34:35] +; GFX9-CONTRACT-NEXT: buffer_load_dword v26, off, s[0:3], s32 +; GFX9-CONTRACT-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 +; GFX9-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GFX9-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[8:9], v[8:9], v[28:29], v[22:23] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[10:11], v[10:11], v[30:31], v[24:25] +; GFX9-CONTRACT-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:80 +; GFX9-CONTRACT-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 +; GFX9-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:88 +; GFX9-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:92 +; GFX9-CONTRACT-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GFX9-CONTRACT-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 +; GFX9-CONTRACT-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; GFX9-CONTRACT-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[12:13], v[12:13], v[26:27], v[22:23] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[14:15], v[14:15], v[34:35], v[24:25] +; GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[20:21], v[28:29] +; 
GFX9-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX9-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[32:33], v[30:31] +; GFX9-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-DENORM-LABEL: test_10xdouble_add_mul_rhs: +; GFX9-DENORM: ; %bb.0: ; %.entry +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[20:21] +; GFX9-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[22:23] +; GFX9-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[24:25] +; GFX9-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[26:27] +; GFX9-DENORM-NEXT: v_mul_f64 v[8:9], v[8:9], v[28:29] +; GFX9-DENORM-NEXT: v_mul_f64 v[10:11], v[10:11], v[30:31] +; GFX9-DENORM-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX9-DENORM-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 +; GFX9-DENORM-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 +; GFX9-DENORM-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:20 +; GFX9-DENORM-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 +; GFX9-DENORM-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:28 +; GFX9-DENORM-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; GFX9-DENORM-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:36 +; GFX9-DENORM-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:40 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(10) +; GFX9-DENORM-NEXT: v_mul_f64 v[12:13], v[12:13], v[20:21] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(8) +; GFX9-DENORM-NEXT: v_mul_f64 v[14:15], v[14:15], v[22:23] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX9-DENORM-NEXT: v_mul_f64 v[16:17], v[16:17], v[24:25] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX9-DENORM-NEXT: v_mul_f64 v[18:19], v[18:19], v[26:27] +; GFX9-DENORM-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX9-DENORM-NEXT: buffer_load_dword v21, 
off, s[0:3], s32 offset:52 +; GFX9-DENORM-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; GFX9-DENORM-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 +; GFX9-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GFX9-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; GFX9-DENORM-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; GFX9-DENORM-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(10) +; GFX9-DENORM-NEXT: v_add_f64 v[0:1], v[28:29], v[0:1] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(8) +; GFX9-DENORM-NEXT: v_add_f64 v[2:3], v[30:31], v[2:3] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX9-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], v[4:5] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX9-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], v[6:7] +; GFX9-DENORM-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 +; GFX9-DENORM-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; GFX9-DENORM-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:88 +; GFX9-DENORM-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; GFX9-DENORM-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96 +; GFX9-DENORM-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:100 +; GFX9-DENORM-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:104 +; GFX9-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(10) +; GFX9-DENORM-NEXT: v_add_f64 v[8:9], v[24:25], v[8:9] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(8) +; GFX9-DENORM-NEXT: v_add_f64 v[10:11], v[26:27], v[10:11] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX9-DENORM-NEXT: v_add_f64 v[12:13], v[20:21], v[12:13] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX9-DENORM-NEXT: v_add_f64 v[14:15], v[22:23], v[14:15] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX9-DENORM-NEXT: v_add_f64 v[16:17], v[28:29], v[16:17] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f64 v[18:19], v[30:31], 
v[18:19] +; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_10xdouble_add_mul_rhs: +; GFX10: ; %bb.0: ; %.entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_clause 0xb +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], v[20:21] +; GFX10-NEXT: v_mul_f64 v[2:3], v[2:3], v[22:23] +; GFX10-NEXT: v_mul_f64 v[4:5], v[4:5], v[24:25] +; GFX10-NEXT: v_mul_f64 v[6:7], v[6:7], v[26:27] +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 +; GFX10-NEXT: v_mul_f64 v[8:9], v[8:9], v[28:29] +; GFX10-NEXT: v_mul_f64 v[10:11], v[10:11], v[30:31] +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_mul_f64 v[12:13], v[12:13], v[32:33] +; GFX10-NEXT: s_waitcnt vmcnt(16) +; 
GFX10-NEXT: v_mul_f64 v[14:15], v[14:15], v[34:35] +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: v_mul_f64 v[16:17], v[16:17], v[36:37] +; GFX10-NEXT: s_waitcnt vmcnt(20) +; GFX10-NEXT: v_mul_f64 v[36:37], v[18:19], v[38:39] +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_add_f64 v[0:1], v[48:49], v[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(14) +; GFX10-NEXT: v_add_f64 v[4:5], v[20:21], v[4:5] +; GFX10-NEXT: v_add_f64 v[2:3], v[50:51], v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: v_add_f64 v[6:7], v[22:23], v[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_add_f64 v[8:9], v[24:25], v[8:9] +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_add_f64 v[10:11], v[26:27], v[10:11] +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add_f64 v[12:13], v[28:29], v[12:13] +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add_f64 v[14:15], v[30:31], v[14:15] +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_f64 v[16:17], v[32:33], v[16:17] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_f64 v[18:19], v[34:35], v[36:37] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-CONTRACT-LABEL: test_10xdouble_add_mul_rhs: +; GFX10-CONTRACT: ; %bb.0: ; %.entry +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CONTRACT-NEXT: s_clause 0xb +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX10-CONTRACT-NEXT: 
buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 +; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 +; GFX10-CONTRACT-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 +; GFX10-CONTRACT-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 +; GFX10-CONTRACT-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 +; GFX10-CONTRACT-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:28 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(10) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[0:1], v[0:1], v[20:21], v[32:33] +; GFX10-CONTRACT-NEXT: s_clause 0x3 +; GFX10-CONTRACT-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX10-CONTRACT-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 +; GFX10-CONTRACT-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX10-CONTRACT-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(12) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[2:3], v[2:3], v[22:23], v[34:35] +; GFX10-CONTRACT-NEXT: s_clause 0x3 +; GFX10-CONTRACT-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 +; GFX10-CONTRACT-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:68 +; GFX10-CONTRACT-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; GFX10-CONTRACT-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(14) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[4:5], v[4:5], v[24:25], v[36:37] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(12) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[6:7], v[6:7], v[26:27], v[38:39] +; GFX10-CONTRACT-NEXT: s_clause 0x7 +; GFX10-CONTRACT-NEXT: buffer_load_dword v24, off, s[0:3], s32 
offset:80 +; GFX10-CONTRACT-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:84 +; GFX10-CONTRACT-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:88 +; GFX10-CONTRACT-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 +; GFX10-CONTRACT-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:96 +; GFX10-CONTRACT-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 +; GFX10-CONTRACT-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 +; GFX10-CONTRACT-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:108 +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(10) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[8:9], v[8:9], v[28:29], v[22:23] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(6) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[12:13], v[12:13], v[20:21], v[24:25] +; GFX10-CONTRACT-NEXT: v_fma_f64 v[10:11], v[10:11], v[30:31], v[34:35] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(4) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[14:15], v[14:15], v[32:33], v[26:27] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(2) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[16:17], v[16:17], v[48:49], v[36:37] +; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; GFX10-CONTRACT-NEXT: v_fma_f64 v[18:19], v[18:19], v[50:51], v[38:39] +; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-DENORM-LABEL: test_10xdouble_add_mul_rhs: +; GFX10-DENORM: ; %bb.0: ; %.entry +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-DENORM-NEXT: s_clause 0xb +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; GFX10-DENORM-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GFX10-DENORM-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; GFX10-DENORM-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 +; GFX10-DENORM-NEXT: buffer_load_dword v39, 
off, s[0:3], s32 offset:28 +; GFX10-DENORM-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 +; GFX10-DENORM-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; GFX10-DENORM-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 +; GFX10-DENORM-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; GFX10-DENORM-NEXT: v_mul_f64 v[0:1], v[0:1], v[20:21] +; GFX10-DENORM-NEXT: v_mul_f64 v[2:3], v[2:3], v[22:23] +; GFX10-DENORM-NEXT: v_mul_f64 v[4:5], v[4:5], v[24:25] +; GFX10-DENORM-NEXT: v_mul_f64 v[6:7], v[6:7], v[26:27] +; GFX10-DENORM-NEXT: s_clause 0x7 +; GFX10-DENORM-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 +; GFX10-DENORM-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:52 +; GFX10-DENORM-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 +; GFX10-DENORM-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 +; GFX10-DENORM-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:64 +; GFX10-DENORM-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 +; GFX10-DENORM-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:72 +; GFX10-DENORM-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:76 +; GFX10-DENORM-NEXT: v_mul_f64 v[8:9], v[8:9], v[28:29] +; GFX10-DENORM-NEXT: v_mul_f64 v[10:11], v[10:11], v[30:31] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(18) +; GFX10-DENORM-NEXT: v_mul_f64 v[12:13], v[12:13], v[32:33] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(16) +; GFX10-DENORM-NEXT: v_mul_f64 v[14:15], v[14:15], v[34:35] +; GFX10-DENORM-NEXT: s_clause 0x7 +; GFX10-DENORM-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:80 +; GFX10-DENORM-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:84 +; GFX10-DENORM-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 +; GFX10-DENORM-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX10-DENORM-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX10-DENORM-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GFX10-DENORM-NEXT: buffer_load_dword v34, off, 
s[0:3], s32 offset:104 +; GFX10-DENORM-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(22) +; GFX10-DENORM-NEXT: v_mul_f64 v[16:17], v[16:17], v[36:37] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(20) +; GFX10-DENORM-NEXT: v_mul_f64 v[36:37], v[18:19], v[38:39] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(18) +; GFX10-DENORM-NEXT: v_add_f64 v[0:1], v[48:49], v[0:1] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(14) +; GFX10-DENORM-NEXT: v_add_f64 v[4:5], v[20:21], v[4:5] +; GFX10-DENORM-NEXT: v_add_f64 v[2:3], v[50:51], v[2:3] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(12) +; GFX10-DENORM-NEXT: v_add_f64 v[6:7], v[22:23], v[6:7] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(10) +; GFX10-DENORM-NEXT: v_add_f64 v[8:9], v[24:25], v[8:9] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(8) +; GFX10-DENORM-NEXT: v_add_f64 v[10:11], v[26:27], v[10:11] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(6) +; GFX10-DENORM-NEXT: v_add_f64 v[12:13], v[28:29], v[12:13] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(4) +; GFX10-DENORM-NEXT: v_add_f64 v[14:15], v[30:31], v[14:15] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(2) +; GFX10-DENORM-NEXT: v_add_f64 v[16:17], v[32:33], v[16:17] +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f64 v[18:19], v[34:35], v[36:37] +; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] +.entry: + %a = fmul <10 x double> %x, %y + %b = fadd <10 x double> %z, %a + ret <10 x double> %b +}