diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -46,6 +46,7 @@
   void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
   void shrinkScalarCompare(MachineInstr &MI) const;
   void shrinkMIMG(MachineInstr &MI) const;
+  void shrinkMadFma(MachineInstr &MI) const;
   bool shrinkScalarLogicOp(MachineInstr &MI) const;
   bool instAccessReg(iterator_range &&R,
                      Register Reg, unsigned SubReg) const;
@@ -324,6 +325,95 @@
   }
 }
 
+// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
+//
+// MI is expected to be V_MAD_F32_e64 or V_FMA_F32_e64; any other opcode that
+// matches one of the patterns below reaches llvm_unreachable. Nothing is
+// changed when the subtarget lacks VOP3 literals, when MI has any modifier
+// set, or when neither pattern matches. If src0 and src1 have to be exchanged,
+// MI is erased and a new instruction is built in its place; otherwise MI is
+// rewritten in place.
+void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
+  if (!ST->hasVOP3Literal())
+    return;
+
+  if (TII->hasAnyModifiersSet(MI))
+    return;
+
+  const unsigned Opcode = MI.getOpcode();
+  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;
+
+  // Set when src0 and src1 must trade places to fit the shrunk form.
+  bool Swap = false;
+
+  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
+  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
+    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg())) {
+      // The VGPR is already the second multiplicand; keep the operand order.
+    } else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg())) {
+      // Only src0 is a VGPR; swap so that it becomes the second multiplicand.
+      Swap = true;
+    } else {
+      return;
+    }
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADAK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAAK_F32;
+      break;
+    }
+  }
+
+  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
+  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
+    if (Src0.isImm() && !TII->isInlineConstant(Src0)) {
+      // The literal is the first multiplicand; swap it into the second slot.
+      Swap = true;
+    } else if (Src1.isImm() && !TII->isInlineConstant(Src1)) {
+      // The literal is already the second multiplicand.
+    } else {
+      return;
+    }
+
+    switch (Opcode) {
+    default:
+      llvm_unreachable("");
+    case AMDGPU::V_MAD_F32_e64:
+      NewOpcode = AMDGPU::V_MADMK_F32;
+      break;
+    case AMDGPU::V_FMA_F32_e64:
+      NewOpcode = AMDGPU::V_FMAMK_F32;
+      break;
+    }
+  }
+
+  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+    return;
+
+  if (Swap) {
+    // Swap Src0 and Src1 by building a new instruction.
+    MachineInstr *NewMI =
+        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
+                MI.getOperand(0).getReg());
+    NewMI->addOperand(Src1);
+    NewMI->addOperand(Src0);
+    NewMI->addOperand(Src2);
+    MI.eraseFromParent();
+  } else {
+    // Operand order already fits: rewrite MI in place.
+    TII->removeModOperands(MI);
+    MI.setDesc(TII->get(NewOpcode));
+  }
+}
+
 /// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals.
 /// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
 /// If the inverse of the immediate is legal, use ANDN2, ORN2 or
@@ -726,6 +816,16 @@
         continue;
       }
 
+      if (!TII->isVOP3(MI))
+        continue;
+
+      // TODO: Also shrink F16 forms.
+      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
+          MI.getOpcode() == AMDGPU::V_FMA_F32_e64) {
+        shrinkMadFma(MI);
+        continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
 
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll
@@ -68,7 +68,7 @@
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_f32_e32 v4, v4, v10
 ; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
-; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
+; GCN-NEXT: v_fmaak_f32 v4, s0, v5, 0x3ca3d70a
 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v1
 ; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
 ; GCN-NEXT: v_fmac_f32_e32 v1, v2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -135,8 +135,8 @@
 ; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
 ; GCN-NOT: v_madak_f32
 ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
-; GFX10-FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
+; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
+; GFX10-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
 ; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
 define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone