Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -443,6 +443,8 @@ return MaxPrivateElementSize; } + unsigned getConstantBusLimit(unsigned Opcode) const; + bool hasIntClamp() const { return HasIntClamp; } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -256,6 +256,26 @@ *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM)); } +unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { + if (getGeneration() < GFX10) + return 1; + + switch (Opcode) { + case AMDGPU::V_LSHLREV_B64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case AMDGPU::V_LSHL_B64: + case AMDGPU::V_LSHRREV_B64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHR_B64: + case AMDGPU::V_ASHRREV_I64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHR_I64: + return 1; + } + + return 2; +} + unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) Index: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2443,6 +2443,8 @@ const unsigned Opcode = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opcode); unsigned ConstantBusUseCount = 0; + unsigned NumLiterals = 0; + unsigned LiteralSize; if (Desc.TSFlags & (SIInstrFlags::VOPC | @@ -2454,8 +2456,10 @@ ++ConstantBusUseCount; } + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst); if (SGPRUsed != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRUsed); ++ConstantBusUseCount; } @@ -2478,16 +2482,42 @@ // 
flat_scratch_lo, flat_scratch_hi // are theoretically valid but they are disabled anyway. // Note that this code mimics SIInstrInfo::verifyInstruction - if (Reg != SGPRUsed) { + if (!SGPRsUsed.count(Reg)) { + SGPRsUsed.insert(Reg); ++ConstantBusUseCount; } SGPRUsed = Reg; } else { // Expression or a literal - ++ConstantBusUseCount; + + if (Desc.OpInfo[OpIdx].OperandType == MCOI::OPERAND_IMMEDIATE) + continue; // special operand like VINTERP attr_chan + + // An instruction may use only one literal. + // This has been validated on the previous step. + // See validateVOP3Literal. + // This literal may be used as more than one operand. + // If all these operands are of the same size, + // this literal counts as one scalar value. + // Otherwise it counts as 2 scalar values. + // See "GFX10 Shader Programming", section 3.6.2.3. + + unsigned Size = AMDGPU::getOperandSize(Desc, OpIdx); + if (Size < 4) Size = 4; + + if (NumLiterals == 0) { + NumLiterals = 1; + LiteralSize = Size; + } else if (LiteralSize != Size) { + NumLiterals = 2; + } } } } } + ConstantBusUseCount += NumLiterals; + + if (isGFX10()) + return ConstantBusUseCount <= 2; return ConstantBusUseCount <= 1; } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2151,9 +2151,11 @@ Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; } else if ((RI.isPhysicalRegister(Src0->getReg()) && - RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) || + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) || (RI.isVirtualRegister(Src0->getReg()) && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))) + (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) return false; // VGPR is okay as Src0 - fallthrough } @@ -2350,7 +2352,9 @@ if (!IsFMA && !Src0Mods && 
!Src1Mods && !Clamp && !Omod && // If we have an SGPR input, we will violate the constant bus restriction. - (!Src0->isReg() || !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { + (ST.getConstantBusLimit(Opc) > 1 || + !Src0->isReg() || + !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) { if (auto Imm = getFoldableImm(Src2)) { return BuildMI(*MBB, MI, MI.getDebugLoc(), get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32)) @@ -3090,9 +3094,12 @@ if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; + SmallVector<unsigned, 2> SGPRsUsed; unsigned SGPRUsed = findImplicitSGPRRead(MI); - if (SGPRUsed != AMDGPU::NoRegister) + if (SGPRUsed != AMDGPU::NoRegister) { ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } for (int OpIdx : OpIndices) { if (OpIdx == -1) @@ -3100,23 +3107,37 @@ const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { - if (MO.getReg() != SGPRUsed) - ++ConstantBusCount; SGPRUsed = MO.getReg(); + if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) { + return !RI.regsOverlap(SGPRUsed, SGPR); + })) { + ++ConstantBusCount; + SGPRsUsed.push_back(SGPRUsed); + } } else { ++ConstantBusCount; ++LiteralCount; } } } - if (ConstantBusCount > 1) { - ErrInfo = "VOP* instruction uses the constant bus more than once"; + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + // v_writelane_b32 is an exception from constant bus restriction: + // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const + if (ConstantBusCount > ST.getConstantBusLimit(Opcode) && + Opcode != AMDGPU::V_WRITELANE_B32) { + ErrInfo = "VOP* instruction violates constant bus restriction"; return false; } if (isVOP3(MI) && LiteralCount) { - ErrInfo = "VOP3 instruction uses literal"; - return false; + if (LiteralCount && !ST.hasVOP3Literal()) { + ErrInfo = "VOP3 instruction uses literal"; + return false; + } + if (LiteralCount > 1) { + ErrInfo = "VOP3 
instruction uses more than one literal"; + return false; + } } } @@ -3509,31 +3530,47 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO) const { - const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); const MCInstrDesc &InstDesc = MI.getDesc(); const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx]; + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const TargetRegisterClass *DefinedRC = OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr; if (!MO) MO = &MI.getOperand(OpIdx); + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--) + return false; - RegSubRegPair SGPRUsed; + SmallDenseSet<RegSubRegPair> SGPRsUsed; if (MO->isReg()) - SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg()); + SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg())); for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { if (i == OpIdx) continue; const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { - if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && + RegSubRegPair SGPR(Op.getReg(), Op.getSubReg()); + if (!SGPRsUsed.count(SGPR) && usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { - return false; + if (--ConstantBusLimit <= 0) + return false; + SGPRsUsed.insert(SGPR); } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { - return false; + if (--ConstantBusLimit <= 0) + return false; + } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) && + isLiteralConstantLike(Op, InstDesc.OpInfo[i])) { + if (!VOP3LiteralLimit--) + return false; + if (--ConstantBusLimit <= 0) + return false; } } } @@ -3569,7 +3606,7 @@ // disabled for the operand type for 
instructions because they will always // violate the one constant bus use rule. bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister; - if (HasImplicitSGPR) { + if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1) { int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); MachineOperand &Src0 = MI.getOperand(Src0Idx); @@ -3680,7 +3717,14 @@ }; // Find the one SGPR operand we are allowed to use. + int ConstantBusLimit = ST.getConstantBusLimit(Opc); + int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0; + SmallDenseSet<unsigned> SGPRsUsed; unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + if (SGPRReg != AMDGPU::NoRegister) { + SGPRsUsed.insert(SGPRReg); + --ConstantBusLimit; + } for (unsigned i = 0; i < 3; ++i) { int Idx = VOP3Idx[i]; @@ -3688,16 +3732,32 @@ break; MachineOperand &MO = MI.getOperand(Idx); - // We should never see a VOP3 instruction with an illegal immediate operand. - if (!MO.isReg()) + if (!MO.isReg()) { + if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx])) + continue; + + if (LiteralLimit > 0 && ConstantBusLimit > 0) { + --LiteralLimit; + --ConstantBusLimit; + continue; + } + + --LiteralLimit; + --ConstantBusLimit; + legalizeOpWithMove(MI, Idx); continue; + } if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) continue; // VGPRs are legal - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. + // We can use one SGPR in each VOP3 instruction prior to GFX10 + // and two starting from GFX10. + if (SGPRsUsed.count(MO.getReg())) + continue; + if (ConstantBusLimit > 0) { + SGPRsUsed.insert(MO.getReg()); + --ConstantBusLimit; continue; }