Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -110,8 +110,11 @@ SDValue &Offset, SDValue &GLC) const; SDNode *SelectAddrSpaceCast(SDNode *N); bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const; + bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods, + SDValue &Clamp, SDValue &Omod) const; bool SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Omod) const; @@ -1317,6 +1320,12 @@ return true; } +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + bool Res = SelectVOP3Mods(In, Src, SrcMods); + return Res && cast<ConstantSDNode>(SrcMods)->isNullValue(); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Clamp, SDValue &Omod) const { @@ -1328,6 +1337,16 @@ return SelectVOP3Mods(In, Src, SrcMods); } +bool AMDGPUDAGToDAGISel::SelectVOP3NoMods0(SDValue In, SDValue &Src, + SDValue &SrcMods, SDValue &Clamp, + SDValue &Omod) const { + bool Res = SelectVOP3Mods0(In, Src, SrcMods, Clamp, Omod); + + return Res && cast<ConstantSDNode>(SrcMods)->isNullValue() && + cast<ConstantSDNode>(Clamp)->isNullValue() && + cast<ConstantSDNode>(Omod)->isNullValue(); +} + bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp(SDValue In, SDValue &Src, SDValue &SrcMods, SDValue &Omod) const { Index: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -126,11 +126,42 @@ return false; } +static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList, + const MachineInstr *MI) { + for (auto 
Candidate : FoldList) { + if (Candidate.UseMI == MI) + return true; + } + return false; +} + static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, MachineInstr *MI, unsigned OpNo, MachineOperand *OpToFold, const SIInstrInfo *TII) { if (!TII->isOperandLegal(MI, OpNo, OpToFold)) { + + // Special case for v_mac_f32_e64 if we are trying to fold into src2 + unsigned Opc = MI->getOpcode(); + if (Opc == AMDGPU::V_MAC_F32_e64 && + (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) { + // Check if changing this to a v_mad_f32 instruction will allow us to + // fold the operand. + MI->setDesc(TII->get(AMDGPU::V_MAD_F32)); + bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold, TII); + if (FoldAsMAD) { + MI->untieRegOperand(OpNo); + return true; + } + MI->setDesc(TII->get(Opc)); + } + + // If we are already folding into another operand of MI, then + // we can't commute the instruction, otherwise we risk making the + // other fold illegal. + if (isUseMIInFoldList(FoldList, MI)) + return false; + // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. 
unsigned CommuteIdx0; Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h @@ -144,6 +144,10 @@ unsigned getMachineCSELookAheadLimit() const override { return 500; } + MachineInstr *convertToThreeAddress(MachineFunction::iterator &MBB, + MachineBasicBlock::iterator &MI, + LiveVariables *LV) const override; + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -924,7 +924,7 @@ return false; unsigned Opc = UseMI->getOpcode(); - if (Opc == AMDGPU::V_MAD_F32) { + if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) { // Don't fold if we are using source modifiers. The new VOP2 instructions // don't have them. if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) || @@ -963,9 +963,9 @@ // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); unsigned Src1Reg = Src1->getReg(); @@ -980,6 +980,14 @@ Src1->setSubReg(Src2SubReg); Src1->setIsKill(Src2->isKill()); + if (Opc == AMDGPU::V_MAC_F32_e64) { + UseMI->untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + } + + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, + AMDGPU::OpName::src2)); + // ChangingToImmediate adds Src2 back to the instruction. 
Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1010,11 +1018,17 @@ // instead of having to modify in place. // Remove these first since they are at the end. - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod)); - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(AMDGPU::V_MAD_F32, + UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp)); + if (Opc == AMDGPU::V_MAC_F32_e64) { + UseMI->untieRegOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); + } + + // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); // These come before src2. @@ -1126,6 +1140,38 @@ return false; } +MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, + MachineBasicBlock::iterator &MI, + LiveVariables *LV) const { + + switch (MI->getOpcode()) { + default: return nullptr; + case AMDGPU::V_MAC_F32_e64: break; + case AMDGPU::V_MAC_F32_e32: { + const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); + if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + return nullptr; + break; + } + } + + const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst); + const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0); + const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1); + const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2); + + return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32)) + .addOperand(*Dst) + .addImm(0) // Src0 mods + .addOperand(*Src0) + .addImm(0) // Src1 mods + .addOperand(*Src1) + .addImm(0) // Src mods + .addOperand(*Src2) + .addImm(0) // clamp + .addImm(0); // omod +} + bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { int64_t SVal = Imm.getSExtValue(); if (SVal >= -16 && SVal <= 64) Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td 
=================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -529,9 +529,11 @@ def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; +def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; +def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">; //===----------------------------------------------------------------------===// // SI assembler operands //===----------------------------------------------------------------------===// @@ -1113,6 +1115,13 @@ field dag Ins = (ins VCSrc_32:$src0, VGPR_32:$vsrc1, u32imm:$src2); field string Asm = "$dst, $src0, $vsrc1, $src2"; } +def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); + let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, + HasModifiers>.ret; + let Asm32 = getAsm32<2>.ret; + let Asm64 = getAsm64<2, HasModifiers>.ret; +} def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; Index: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td +++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td @@ -1488,7 +1488,10 @@ defm V_OR_B32 : VOP2Inst <vop2<0x1c, 0x14>, "v_or_b32", VOP_I32_I32_I32>; defm V_XOR_B32 : VOP2Inst <vop2<0x1d, 0x15>, "v_xor_b32", VOP_I32_I32_I32>; -defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_F32_F32_F32>; +let Constraints = "$dst = $src2", DisableEncoding="$src2", + isConvertibleToThreeAddress = 1 in { +defm V_MAC_F32 : VOP2Inst <vop2<0x1f, 0x16>, "v_mac_f32", VOP_MAC>; +} } // End isCommutable = 1 defm V_MADMK_F32 : VOP2MADK <vop2<0x20, 0x17>, "v_madmk_f32">; @@ -2206,6 +2209,15 @@ (V_CNDMASK_B32_e64 $src2, $src1, $src0) >; +// Pattern for V_MAC_F32 +def : Pat < + (fmad (VOP3NoMods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3NoMods f32:$src1, i32:$src1_modifiers), + 
(VOP3NoMods f32:$src2, i32:$src2_modifiers)), + (V_MAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, + $src2_modifiers, $src2, $clamp, $omod) +>; + /********** ======================= **********/ /********** Image sampling patterns **********/ /********** ======================= **********/ Index: llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -94,8 +94,15 @@ // is vcc. We should handle this the same way we handle vopc, by addding // a register allocation hint pre-regalloc and then do the shrining // post-regalloc. - if (Src2) - return false; + if (Src2) { + if (MI.getOpcode() != AMDGPU::V_MAC_F32_e64) + return false; + + const MachineOperand *Src2Mod = + TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers); + if (!isVGPR(Src2, TRI, MRI) || (Src2Mod && Src2Mod->getImm() != 0)) + return false; + } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); const MachineOperand *Src1Mod = @@ -259,6 +266,11 @@ if (Src1) Inst32.addOperand(*Src1); + const MachineOperand *Src2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) + Inst32.addOperand(*Src2); + ++NumInstructionsShrunk; MI.eraseFromParent(); Index: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.ll +++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.ll @@ -6,7 +6,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; CHECK-LABEL: {{^}}fmuladd_f32: -; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; CHECK: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { @@ -34,8 +34,8 @@ ; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 ; CHECK-DAG: buffer_load_dword 
[[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -53,8 +53,8 @@ ; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -72,8 +72,8 @@ ; CHECK-LABEL: {{^}}fadd_a_a_b_f32: ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { @@ -94,8 +94,8 @@ ; CHECK-LABEL: {{^}}fadd_b_a_a_f32: ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: 
buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { @@ -116,8 +116,8 @@ ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -136,8 +136,8 @@ ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -158,8 +158,8 @@ ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}test_lrp: ; SI: v_sub_f32 -; SI: v_mad_f32 +; SI: v_mac_f32_e32 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone store float %mad, float addrspace(1)* %out, align 4 Index: llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll +++ llvm/trunk/test/CodeGen/AMDGPU/mad-combine.ll @@ -19,7 +19,7 @@ ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] @@ -29,7 +29,8 @@ ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] -; SI: buffer_store_dword [[RESULT]] +; SI-DENORM: buffer_store_dword [[RESULT]] +; SI-STD: buffer_store_dword [[C]] define void @combine_to_mad_f32_0(float addrspace(1)* 
noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -54,8 +55,8 @@ ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] +; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] +; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] @@ -64,8 +65,10 @@ ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 @@ -96,13 +99,14 @@ ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, 
s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI: buffer_store_dword [[RESULT]] +; SI-DENORM: buffer_store_dword [[RESULT]] +; SI-STD: buffer_store_dword [[C]] define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -482,7 +486,7 @@ ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] +; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] @@ -492,7 +496,8 @@ ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 Index: llvm/trunk/test/CodeGen/AMDGPU/mad-sub.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/mad-sub.ll +++ llvm/trunk/test/CodeGen/AMDGPU/mad-sub.ll @@ 
-123,7 +123,7 @@ } ; FUNC-LABEL: {{^}}neg_neg_mad_f32: -; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %tid.ext = sext i32 %tid to i64 @@ -172,8 +172,8 @@ ; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; SI: buffer_store_dword [[RESULT]] +; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI: buffer_store_dword [[R2]] define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid Index: llvm/trunk/test/CodeGen/AMDGPU/madak.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/madak.ll +++ llvm/trunk/test/CodeGen/AMDGPU/madak.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}madak_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 +; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -34,8 +34,8 @@ ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 
addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] +; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] ; GCN: s_endpgm define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -105,7 +105,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] +; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -124,7 +124,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] +; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -140,7 +140,7 @@ ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 Index: llvm/trunk/test/CodeGen/AMDGPU/madmk.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/madmk.ll +++ 
llvm/trunk/test/CodeGen/AMDGPU/madmk.ll @@ -28,8 +28,8 @@ ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] +; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] ; GCN: s_endpgm define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -59,7 +59,7 @@ ; GCN-LABEL: {{^}}madmk_inline_imm_f32: ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] +; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -77,7 +77,7 @@ ; GCN-LABEL: {{^}}s_s_madmk_f32: ; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 +; GCN: v_mac_f32_e32 ; GCN: s_endpgm define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -107,7 +107,7 @@ ; GCN-LABEL: {{^}}scalar_vector_madmk_f32: ; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 +; GCN: v_mac_f32_e32 ; GCN: s_endpgm define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone Index: 
llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll +++ llvm/trunk/test/CodeGen/AMDGPU/v_mac.ll @@ -0,0 +1,155 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}mac_vvv: +; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}} +; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 +; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN: buffer_store_dword [[C]] +define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_inline_sgpr_inline: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5 +define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) { +entry: + %tmp0 = fmul float 0.5, %in + %tmp1 = fadd float %tmp0, 0.5 + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_vvs: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + store float %tmp1, 
float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mac_ssv: +; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) { +entry: + %c = load float, float addrspace(1)* %in + + %tmp0 = fmul float %a, %a + %tmp1 = fadd float %tmp0, %c + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mac_mad_same_add: +; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] +; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3 + %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + %d = load float, float addrspace(1)* %d_ptr + %e = load float, float addrspace(1)* %e_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + + %tmp2 = fmul float %d, %e + %tmp3 = fadd float %tmp2, %c + + %out1 = getelementptr float, float addrspace(1)* %out, i32 1 + store float %tmp1, float addrspace(1)* %out + store float %tmp3, float addrspace(1)* %out1 + ret void +} + +; There is no advantage to using v_mac when one of the operands is negated +; and v_mad accepts more operand types. 
+ +; GCN-LABEL: {{^}}mad_neg_src0: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_a = fsub float 0.0, %a + %tmp0 = fmul float %neg_a, %b + %tmp1 = fadd float %tmp0, %c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_neg_src1: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_b = fsub float 0.0, %b + %tmp0 = fmul float %a, %neg_b + %tmp1 = fadd float %tmp0, %c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_neg_src2: +; GCN-NOT: v_mac +; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} +define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_c = fsub float 0.0, %c + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %neg_c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +attributes #0 = { "true" "unsafe-fp-math"="true" }