Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,7 @@
 
   const MachineOperand *isClamp(const MachineInstr &MI) const;
   bool tryFoldClamp(MachineInstr &MI);
+  bool tryFoldOModMad(MachineInstr &MI);
 
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
@@ -766,6 +767,7 @@
 
 static int getOModValue(unsigned Opc, int64_t Val) {
   switch (Opc) {
+  case AMDGPU::V_MAD_F32:
   case AMDGPU::V_MUL_F32_e64: {
     switch (static_cast<uint32_t>(Val)) {
     case 0x3f000000: // 0.5
@@ -858,10 +860,81 @@
   }
 }
 
+// Try to rewrite "v_mad_f32 x, K, C" as "v_add_f32 x, C' omod:K", where K is
+// an output-modifier multiplier (0.5, 2.0 or 4.0) and C' = C / K. The rewrite
+// is only done when C' is still an inline constant and no source or output
+// modifiers are set on the MAD. Returns true if MI was replaced and erased.
+bool SIFoldOperands::tryFoldOModMad(MachineInstr &MI) {
+  unsigned Op = MI.getOpcode();
+
+  // Bail out when FP32 denormals are enabled.
+  if (ST->hasFP32Denormals())
+    return false;
+
+  const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+
+  if (!Src1->isImm() || !Src2->isImm())
+    return false;
+
+  // See if the multiply operand is omod compatible. Any modifier on the MAD,
+  // including a source modifier on the addend (src2), would be lost.
+  int OMod = getOModValue(Op, Src1->getImm());
+  if (OMod == SIOutMods::NONE ||
+      TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+      TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+      TII->hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
+      TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+      TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+    return false;
+
+  // omod scales the whole sum, so pre-divide the addend by the multiplier.
+  float Addend = BitsToFloat(Src2->getImm());
+
+  switch (OMod) {
+  case SIOutMods::DIV2:
+    Addend *= 2.0f;
+    break;
+  case SIOutMods::MUL2:
+    Addend *= 0.5f;
+    break;
+  case SIOutMods::MUL4:
+    Addend *= 0.25f;
+    break;
+  default:
+    break;
+  }
+
+  // The rescaled addend must still be encodable as an inline constant.
+  const uint32_t AddendBits = FloatToBits(Addend);
+  if (!AMDGPU::isInlinableLiteral32(AddendBits, false))
+    return false;
+
+  DEBUG(dbgs() << "Replacing MAD " << MI << " with omod Add\n");
+
+  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+          TII->get(AMDGPU::V_ADD_F32_e64),
+          MI.getOperand(0).getReg())
+    .addImm(0) // src0 modifiers
+    .add(*Src0)
+    .addImm(0) // src1 modifiers
+    .addImm(static_cast<int32_t>(AddendBits)) // src1
+    .addImm(0) // clamp
+    .addImm(OMod);
+  MI.eraseFromParent();
+  return true;
+}
+
 // FIXME: Does this need to check IEEE bit on function?
 bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
   const MachineOperand *RegOp;
   int OMod;
+  unsigned Op = MI.getOpcode();
+
+  if (Op == AMDGPU::V_MAD_F32)
+    return tryFoldOModMad(MI);
+
   std::tie(RegOp, OMod) = isOMod(MI);
   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
       RegOp->getSubReg() != AMDGPU::NoSubRegister ||