Index: lib/Target/AMDGPU/AMDGPU.td =================================================================== --- lib/Target/AMDGPU/AMDGPU.td +++ lib/Target/AMDGPU/AMDGPU.td @@ -8,23 +8,12 @@ //===------------------------------------------------------------===// include "llvm/Target/Target.td" +include "AMDGPUFeatures.td" //===------------------------------------------------------------===// // Subtarget Features (device properties) //===------------------------------------------------------------===// -def FeatureFP64 : SubtargetFeature<"fp64", - "FP64", - "true", - "Enable double precision operations" ->; - -def FeatureFMA : SubtargetFeature<"fmaf", - "FMA", - "true", - "Enable single precision FMA (not as fast as mul+add, but fused)" ->; - def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf", "FastFMAF32", "true", @@ -43,30 +32,6 @@ "Most fp64 instructions are half rate instead of quarter" >; -def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", - "R600ALUInst", - "false", - "Older version of ALU instructions encoding" ->; - -def FeatureVertexCache : SubtargetFeature<"HasVertexCache", - "HasVertexCache", - "true", - "Specify use of dedicated vertex cache" ->; - -def FeatureCaymanISA : SubtargetFeature<"caymanISA", - "CaymanISA", - "true", - "Use Cayman ISA" ->; - -def FeatureCFALUBug : SubtargetFeature<"cfalubug", - "CFALUBug", - "true", - "GPU has CF_ALU bug" ->; - def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", @@ -146,27 +111,6 @@ "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; -class SubtargetFeatureFetchLimit : - SubtargetFeature <"fetch"#Value, - "TexVTXClauseSize", - Value, - "Limit the maximum number of fetches in a clause to "#Value ->; - -def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; -def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; - -class SubtargetFeatureWavefrontSize : SubtargetFeature< - "wavefrontsize"#Value, - "WavefrontSize", - !cast(Value), - "The number of threads per wavefront" ->; - -def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; -def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; -def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; - class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -177,19 +121,6 @@ def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>; def FeatureLDSBankCount32 : SubtargetFeatureLDSBankCount<32>; -class SubtargetFeatureLocalMemorySize : SubtargetFeature< - "localmemorysize"#Value, - "LocalMemorySize", - !cast(Value), - "The size of local memory in bytes" ->; - -def FeatureGCN : SubtargetFeature<"gcn", - "IsGCN", - "true", - "GCN or newer GPU" ->; - def FeatureGCN3Encoding : SubtargetFeature<"gcn3-encoding", "GCN3Encoding", "true", @@ -347,12 +278,6 @@ [FeatureFP64FP16Denormals] >; -def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", - "DX10Clamp", - "true", - "clamp modifier clamps NaNs to 0.0" ->; - def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", @@ -395,12 +320,6 @@ "Dump MachineInstrs in the CodeEmitter" >; -def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", - "EnablePromoteAlloca", - "true", - "Enable promote alloca pass" ->; - // XXX - This should probably be removed once enabled by default def FeatureEnableLoadStoreOpt : SubtargetFeature <"load-store-opt", "EnableLoadStoreOpt", @@ -464,45 +383,29 @@ "Dummy feature to disable assembler instructions" >; -class SubtargetFeatureGeneration Implies> : - 
SubtargetFeature ; - -def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; -def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; -def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; - -def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] ->; - -def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16, FeatureLocalMemorySize0] ->; - -def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16, FeatureLocalMemorySize32768] +def FeatureGCN : SubtargetFeature<"gcn", + "IsGCN", + "true", + "GCN or newer GPU" >; -def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64, - FeatureLocalMemorySize32768] ->; +class AMDGPUSubtargetFeatureGeneration Implies> : + SubtargetFeatureGeneration ; -def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", +def FeatureSouthernIslands : AMDGPUSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureLDSBankCount32, FeatureMovrel] >; -def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", +def FeatureSeaIslands : AMDGPUSubtargetFeatureGeneration<"SEA_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureGCN, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel] >; -def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS", +def FeatureVolcanicIslands : AMDGPUSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, @@ -513,7 +416,7 @@ ] >; -def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", +def FeatureGFX9 : AMDGPUSubtargetFeatureGeneration<"GFX9", [FeatureFP64, FeatureLocalMemorySize65536, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN, FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, @@ -710,8 +613,6 @@ // Predicate helper class //===----------------------------------------------------------------------===// -def TruePredicate : Predicate<"true">; - def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" @@ -791,36 +692,15 @@ def EnableLateCFGStructurize : Predicate< "EnableLateStructurizeCFG">; -// Exists to help track down where SubtargetPredicate isn't set rather -// than letting tablegen crash with an unhelpful error. 
-def InvalidPred : Predicate<"predicate not set on instruction or pattern">; - -class PredicateControl { - Predicate SubtargetPredicate = InvalidPred; - Predicate SIAssemblerPredicate = isSICI; - Predicate VIAssemblerPredicate = isVI; - list AssemblerPredicates = []; - Predicate AssemblerPredicate = TruePredicate; - list OtherPredicates = []; - list Predicates = !listconcat([SubtargetPredicate, - AssemblerPredicate], - AssemblerPredicates, - OtherPredicates); -} - -class AMDGPUPat : Pat, - PredicateControl; - - // Include AMDGPU TD files -include "R600Schedule.td" -include "R600Processors.td" include "SISchedule.td" include "GCNProcessors.td" include "AMDGPUInstrInfo.td" include "AMDGPUIntrinsics.td" +include "SIIntrinsics.td" include "AMDGPURegisterInfo.td" include "AMDGPURegisterBanks.td" include "AMDGPUInstructions.td" +include "SIInstrInfo.td" include "AMDGPUCallingConv.td" include "AMDGPUSearchableTables.td" Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -194,7 +194,7 @@ if (!MFI->isEntryFunction()) return; - const AMDGPUSubtarget &STM = MF->getSubtarget(); + const AMDGPUCommonSubtarget &STM = AMDGPUCommonSubtarget::get(*MF); amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); @@ -213,7 +213,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { const SIMachineFunctionInfo *MFI = MF->getInfo(); - const AMDGPUSubtarget &STM = MF->getSubtarget(); + const AMDGPUCommonSubtarget &STM = AMDGPUCommonSubtarget::get(*MF); if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) { SmallString<128> SymbolName; getNameWithPrefix(SymbolName, &MF->getFunction()), @@ -302,7 +302,7 @@ SetupMachineFunction(MF); - const AMDGPUSubtarget &STM = MF.getSubtarget(); + const AMDGPUCommonSubtarget &STM = AMDGPUCommonSubtarget::get(MF); MCContext &Context = getObjFileLowering().getContext(); // FIXME: This should be an explicit check for Mesa. 
if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) { @@ -311,7 +311,7 @@ OutStreamer->SwitchSection(ConfigSection); } - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { if (MFI->isEntryFunction()) { getSIProgramInfo(CurrentProgramInfo, MF); } else { @@ -342,7 +342,7 @@ Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0); OutStreamer->SwitchSection(CommentSection); - if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()]; @@ -451,7 +451,7 @@ for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() == AMDGPU::KILLGT) + if (MI.getOpcode() == R600::KILLGT) killPixel = true; unsigned numOperands = MI.getNumOperands(); for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { Index: lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- lib/Target/AMDGPU/AMDGPUCallingConv.td +++ lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -85,17 +85,6 @@ ]>> ]>; -// Calling convention for R600 -def CC_R600 : CallingConv<[ - CCIfInReg>> -]>; - // Calling convention for compute kernels def CC_AMDGPU_Kernel : CallingConv<[ CCCustom<"allocateKernArg"> @@ -165,9 +154,5 @@ CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C", - CCDelegateTo>, - CCIf<"static_cast" - "(State.getMachineFunction().getSubtarget()).getGeneration() < " - "AMDGPUSubtarget::SOUTHERN_ISLANDS", - CCDelegateTo> + CCDelegateTo> ]>; Index: lib/Target/AMDGPU/AMDGPUFeatures.td =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUFeatures.td @@ -0,0 +1,52 @@ + +def FeatureFP64 : SubtargetFeature<"fp64", + "FP64", + "true", + "Enable double precision operations" +>; + +def FeatureFMA : SubtargetFeature<"fmaf", + "FMA", + "true", + "Enable single precision FMA (not as fast as mul+add, but fused)" +>; + +class SubtargetFeatureLocalMemorySize : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast(Value), + "The size of local memory in bytes" +>; + +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + +class SubtargetFeatureWavefrontSize : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast(Value), + "The number of threads per wavefront" +>; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + +class SubtargetFeatureGeneration Implies> : + SubtargetFeature ; + +def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp", + "DX10Clamp", + "true", + "clamp modifier clamps NaNs to 0.0" +>; + +def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca", + "EnablePromoteAlloca", + "true", + "Enable promote alloca pass" +>; + Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -102,15 +102,11 @@ bool 
isNoNanSrc(SDValue N) const; bool isInlineImmediate(const SDNode *N) const; - bool isConstantLoad(const MemSDNode *N, int cbID) const; bool isUniformBr(const SDNode *N) const; SDNode *glueCopyToM0(SDNode *N) const; const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const; - bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); - bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, - SDValue& Offset); virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset); bool isDSOffsetLegal(const SDValue &Base, unsigned Offset, @@ -225,9 +221,18 @@ }; class R600DAGToDAGISel : public AMDGPUDAGToDAGISel { + const R600Subtarget *Subtarget; + AMDGPUAS AMDGPUASI; + + bool isConstantLoad(const MemSDNode *N, int cbID) const; + bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr); + bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg, + SDValue& Offset); public: explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) : - AMDGPUDAGToDAGISel(TM, OptLevel) {} + AMDGPUDAGToDAGISel(TM, OptLevel) { + AMDGPUASI = AMDGPU::getAMDGPUAS(*TM); + } void Select(SDNode *N) override; @@ -235,6 +240,11 @@ SDValue &Offset) override; bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) override; + + bool runOnMachineFunction(MachineFunction &MF) override; +protected: + // Include the pieces autogenerated from the target description. +#include "R600GenDAGISel.inc" }; } // end anonymous namespace @@ -276,8 +286,7 @@ } bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const { - const SIInstrInfo *TII - = static_cast(Subtarget)->getInstrInfo(); + const SIInstrInfo *TII = Subtarget->getInstrInfo(); if (const ConstantSDNode *C = dyn_cast(N)) return TII->isInlineConstant(C->getAPIntValue()); @@ -394,7 +403,6 @@ EVT VT = N->getValueType(0); unsigned NumVectorElts = VT.getVectorNumElements(); EVT EltVT = VT.getVectorElementType(); - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); SDLoc DL(N); SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); @@ -422,7 +430,7 @@ } RegSeqArgs[1 + (2 * i)] = N->getOperand(i); RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, + CurDAG->getTargetConstant(AMDGPURegisterInfo::getSubRegFromChannel(i), DL, MVT::i32); } if (NOps != NumVectorElts) { @@ -433,7 +441,7 @@ for (unsigned i = NOps; i < NumVectorElts; ++i) { RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0); RegSeqArgs[1 + (2 * i) + 1] = - CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), DL, MVT::i32); + CurDAG->getTargetConstant(AMDGPURegisterInfo::getSubRegFromChannel(i), DL, MVT::i32); } } @@ -632,16 +640,6 @@ SelectCode(N); } -bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { - if (!N->readMem()) - return false; - if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; - - return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; -} - bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const { const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); const Instruction *Term = BB->getTerminator(); @@ -657,26 +655,6 @@ // Complex Patterns //===----------------------------------------------------------------------===// -bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, - SDValue& IntPtr) { - if (ConstantSDNode *Cst 
= dyn_cast(Addr)) { - IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), - true); - return true; - } - return false; -} - -bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, - SDValue& BaseReg, SDValue &Offset) { - if (!isa(Addr)) { - BaseReg = Addr; - Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); - return true; - } - return false; -} - bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset) { return false; @@ -688,11 +666,11 @@ SDLoc DL(Addr); if ((C = dyn_cast(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C = dyn_cast(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast(Addr.getOperand(1)))) { @@ -2149,6 +2127,41 @@ } while (IsModified); } +bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) { + Subtarget = &MF.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); +} + +bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { + if (!N->readMem()) + return false; + if (CbId == -1) + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; + + return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; +} + +bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr, + SDValue& IntPtr) { + if (ConstantSDNode *Cst = dyn_cast(Addr)) { + IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr), + true); + return true; + } + return false; +} + +bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr, + SDValue& BaseReg, SDValue &Offset) { + if (!isa(Addr)) { + BaseReg = Addr; + Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true); + return true; + } + return false; +} + void R600DAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -2169,12 +2182,12 @@ // pass. We want to avoid 128 bits copies as much as possible because they // can't be bundled by our scheduler. 
switch(NumVectorElts) { - case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; + case 2: RegClassID = R600::R600_Reg64RegClassID; break; case 4: if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) - RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + RegClassID = R600::R600_Reg128VerticalRegClassID; else - RegClassID = AMDGPU::R600_Reg128RegClassID; + RegClassID = R600::R600_Reg128RegClassID; break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } @@ -2192,11 +2205,11 @@ SDLoc DL(Addr); if ((C = dyn_cast(Addr))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) && (C = dyn_cast(Addr.getOperand(0)))) { - Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32); + Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32); Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32); } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) && (C = dyn_cast(Addr.getOperand(1)))) { @@ -2227,7 +2240,7 @@ && isInt<16>(IMMOffset->getZExtValue())) { Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(CurDAG->getEntryNode()), - AMDGPU::ZERO, MVT::i32); + R600::ZERO, MVT::i32); Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr), MVT::i32); return true; Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -23,11 +23,13 @@ namespace llvm { class AMDGPUMachineFunction; -class AMDGPUSubtarget; +class AMDGPUCommonSubtarget; struct ArgDescriptor; class AMDGPUTargetLowering : public TargetLowering { private: + const AMDGPUCommonSubtarget *Subtarget; + /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been /// legalized from a smaller type VT. 
Need to match pre-legalized type because /// the generic legalization inserts the add/sub between the select and @@ -39,7 +41,6 @@ static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG); protected: - const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; @@ -123,7 +124,7 @@ void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl &Ins) const; public: - AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI); + AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUCommonSubtarget &STI); bool mayIgnoreSignedZero(SDValue Op) const { if (getTargetMachine().Options.NoSignedZerosFPMath) Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -151,7 +151,7 @@ } AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, - const AMDGPUSubtarget &STI) + const AMDGPUCommonSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { AMDGPUASI = AMDGPU::getAMDGPUAS(TM); // Lower floating point store/load to integer store/load to reduce the number @@ -355,13 +355,6 @@ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom); - if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - setOperationAction(ISD::FCEIL, MVT::f64, Custom); - setOperationAction(ISD::FTRUNC, MVT::f64, Custom); - setOperationAction(ISD::FRINT, MVT::f64, Custom); - setOperationAction(ISD::FFLOOR, MVT::f64, Custom); - } - if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); @@ -773,7 +766,7 @@ { const LoadSDNode * L = dyn_cast(N); if (L->getMemOperand()->getAddrSpace() - == Subtarget->getAMDGPUAS().CONSTANT_ADDRESS_32BIT) + == AMDGPUASI.CONSTANT_ADDRESS_32BIT) return true; return false; } @@ -824,7 +817,7 @@ case ISD::LOAD: { const LoadSDNode *L = dyn_cast(N); if (L->getMemOperand()->getAddrSpace() == - Subtarget->getAMDGPUAS().PRIVATE_ADDRESS) + AMDGPUASI.PRIVATE_ADDRESS) return true; } break; case ISD::CALLSEQ_END: @@ -4302,9 +4295,11 @@ switch (IID) { case Intrinsic::amdgcn_mbcnt_lo: case Intrinsic::amdgcn_mbcnt_hi: { + const SISubtarget &ST = + DAG.getMachineFunction().getSubtarget(); // These return at most the wavefront size - 1. 
unsigned Size = Op.getValueType().getSizeInBits(); - Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2()); + Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2()); break; } default: Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -20,10 +20,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#define GET_INSTRINFO_HEADER -#include "AMDGPUGenInstrInfo.inc" -#undef GET_INSTRINFO_HEADER - namespace llvm { class AMDGPUSubtarget; @@ -31,21 +27,15 @@ class MachineInstr; class MachineInstrBuilder; -class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +class AMDGPUInstrInfo { private: const AMDGPUSubtarget &ST; - virtual void anchor(); -protected: - AMDGPUAS AMDGPUASI; +// virtual void anchor(); public: explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st); - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, - unsigned NumLoads) const override; - /// \brief Return a target-specific opcode if Opcode is a pseudo instruction. /// Return -1 if the target-specific opcode for the pseudo instruction does /// not exist. If Opcode is not a pseudo instruction, this is identity. Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -23,109 +23,14 @@ using namespace llvm; -#define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenInstrInfo.inc" - -namespace llvm { -namespace AMDGPU { -#define GET_RSRCINTRINSIC_IMPL -#include "AMDGPUGenSearchableTables.inc" - -#define GET_D16IMAGEDIMINTRINSIC_IMPL -#include "AMDGPUGenSearchableTables.inc" -} -} - // Pin the vtable to this file. -void AMDGPUInstrInfo::anchor() {} +//void AMDGPUInstrInfo::anchor() {} AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST) - : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), - ST(ST), - AMDGPUASI(ST.getAMDGPUAS()) {} - -// FIXME: This behaves strangely. If, for example, you have 32 load + stores, -// the first 16 loads will be interleaved with the stores, and the next 16 will -// be clustered as expected. It should really split into 2 16 store batches. -// -// Loads are clustered until this returns false, rather than trying to schedule -// groups of stores. This also means we have to deal with saying different -// address space loads should be clustered, and ones which might cause bank -// conflicts. -// -// This might be deprecated so it might not be worth that much effort to fix. -bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, - int64_t Offset0, int64_t Offset1, - unsigned NumLoads) const { - assert(Offset1 > Offset0 && - "Second offset should be larger than first offset!"); - // If we have less than 16 loads in a row, and the offsets are within 64 - // bytes, then schedule together. - - // A cacheline is 64 bytes (for global memory). 
- return (NumLoads <= 16 && (Offset1 - Offset0) < 64); -} + : ST(ST) { } -// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td -enum SIEncodingFamily { - SI = 0, - VI = 1, - SDWA = 2, - SDWA9 = 3, - GFX80 = 4, - GFX9 = 5 -}; -static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) { - switch (ST.getGeneration()) { - case AMDGPUSubtarget::SOUTHERN_ISLANDS: - case AMDGPUSubtarget::SEA_ISLANDS: - return SIEncodingFamily::SI; - case AMDGPUSubtarget::VOLCANIC_ISLANDS: - case AMDGPUSubtarget::GFX9: - return SIEncodingFamily::VI; - // FIXME: This should never be called for r600 GPUs. - case AMDGPUSubtarget::R600: - case AMDGPUSubtarget::R700: - case AMDGPUSubtarget::EVERGREEN: - case AMDGPUSubtarget::NORTHERN_ISLANDS: - return SIEncodingFamily::SI; - } - - llvm_unreachable("Unknown subtarget generation!"); -} - -int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { - SIEncodingFamily Gen = subtargetEncodingFamily(ST); - - if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 && - ST.getGeneration() >= AMDGPUSubtarget::GFX9) - Gen = SIEncodingFamily::GFX9; - - if (get(Opcode).TSFlags & SIInstrFlags::SDWA) - Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9 - : SIEncodingFamily::SDWA; - // Adjust the encoding family to GFX80 for D16 buffer instructions when the - // subtarget has UnpackedD16VMem feature. - // TODO: remove this when we discard GFX80 encoding. - if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16) - && !(get(Opcode).TSFlags & SIInstrFlags::MIMG)) - Gen = SIEncodingFamily::GFX80; - - int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); - - // -1 means that Opcode is already a native instruction. - if (MCOp == -1) - return Opcode; - - // (uint16_t)-1 means that Opcode is a pseudo instruction that has - // no encoding in the given subtarget generation. - if (MCOp == (uint16_t)-1) - return -1; - - return MCOp; -} // TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence. bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) { Index: lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- lib/Target/AMDGPU/AMDGPUInstructions.td +++ lib/Target/AMDGPU/AMDGPUInstructions.td @@ -42,6 +42,47 @@ field bits<32> Inst = 0xffffffff; } +//===---------------------------------------------------------------------===// +// Return instruction +//===---------------------------------------------------------------------===// + +class ILFormat pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let isCodeGenOnly = 1; +} + +def TruePredicate : Predicate<"true">; + +// Exists to help track down where SubtargetPredicate isn't set rather +// than letting tablegen crash with an unhelpful error. 
+def InvalidPred : Predicate<"predicate not set on instruction or pattern">; + +class PredicateControl { + Predicate SubtargetPredicate = InvalidPred; + list AssemblerPredicates = []; + Predicate AssemblerPredicate = TruePredicate; + list OtherPredicates = []; + list Predicates = !listconcat([SubtargetPredicate, + AssemblerPredicate], + AssemblerPredicates, + OtherPredicates); +} +class AMDGPUPat : Pat, + PredicateControl; + def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">; def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">; def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">; @@ -94,12 +135,6 @@ // Misc. PatFrags //===----------------------------------------------------------------------===// -class HasOneUseUnaryOp : PatFrag< - (ops node:$src0), - (op $src0), - [{ return N->hasOneUse(); }] ->; - class HasOneUseBinOp : PatFrag< (ops node:$src0, node:$src1), (op $src0, $src1), @@ -112,8 +147,6 @@ [{ return N->hasOneUse(); }] >; -def trunc_oneuse : HasOneUseUnaryOp; - let Properties = [SDNPCommutative, SDNPAssociative] in { def smax_oneuse : HasOneUseBinOp; def smin_oneuse : HasOneUseBinOp; @@ -239,6 +272,37 @@ [{(void)N; return false;}] >; +//===----------------------------------------------------------------------===// +// PatLeafs for Texture Constants +//===----------------------------------------------------------------------===// + +def TEX_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 9 || TType == 10 || TType == 16; + }] +>; + +def TEX_RECT : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 5; + }] +>; + +def TEX_SHADOW : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return (TType >= 6 && TType <= 8) || TType == 13; + }] +>; + +def TEX_SHADOW_ARRAY : PatLeaf< + (imm), + [{uint32_t TType = (uint32_t)N->getZExtValue(); + return TType == 11 || TType == 12 || TType == 17; + }] +>; //===----------------------------------------------------------------------===// // Load/Store Pattern Fragments @@ -746,11 +810,3 @@ (AMDGPUrcp (fsqrt vt:$src)), (RsqInst $src) >; - -include "R600Instructions.td" -include "R700Instructions.td" -include "EvergreenInstructions.td" -include "CaymanInstructions.td" - -include "SIInstrInfo.td" - Index: lib/Target/AMDGPU/AMDGPUIntrinsics.td =================================================================== --- lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -14,5 +14,3 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; } - -include "SIIntrinsics.td" Index: lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -117,7 +117,6 @@ return false; const TargetMachine &TM = TPC->getTM(); - const AMDGPUSubtarget &ST = TM.getSubtarget(F); bool Changed = false; for (auto *U : F.users()) { @@ -125,7 +124,7 @@ if (!CI) continue; - Changed |= ST.makeLIDRangeMetadata(CI); + Changed |= AMDGPUCommonSubtarget::get(TM, F).makeLIDRangeMetadata(CI); } return Changed; } Index: lib/Target/AMDGPU/AMDGPUMCInstLower.h =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -12,7 +12,6 @@ namespace llvm { -class AMDGPUSubtarget; class AsmPrinter; class MachineBasicBlock; class MachineInstr; @@ -21,17 +20,18 
@@ class MCExpr; class MCInst; class MCOperand; +class TargetSubtargetInfo; class AMDGPUMCInstLower { MCContext &Ctx; - const AMDGPUSubtarget &ST; + const TargetSubtargetInfo &ST; const AsmPrinter &AP; const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, const MachineOperand &MO) const; public: - AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST, + AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST, const AsmPrinter &AP); bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; Index: lib/Target/AMDGPU/AMDGPUMCInstLower.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -39,7 +39,8 @@ #include "AMDGPUGenMCPseudoLowering.inc" -AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st, +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, + const TargetSubtargetInfo &st, const AsmPrinter &ap): Ctx(ctx), ST(st), AP(ap) { } @@ -66,9 +67,10 @@ const MCExpr *DestBBSym = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); + const SIInstrInfo *TII = static_cast(ST.getInstrInfo()); assert(SrcBB.front().getOpcode() == AMDGPU::S_GETPC_B64 && - ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); + TII->get(AMDGPU::S_GETPC_B64).Size == 4); // s_getpc_b64 returns the address of next instruction. const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); @@ -90,7 +92,10 @@ MCOp = MCOperand::createImm(MO.getImm()); return true; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); + if (ST.getTargetTriple().getArch() == Triple::amdgcn) + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); + else + MCOp = MCOperand::createReg(MO.getReg()); return true; case MachineOperand::MO_MachineBasicBlock: { if (MO.getTargetFlags() != 0) { @@ -130,33 +135,38 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); - const auto *TII = ST.getInstrInfo(); - - // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We - // need to select it to the subtarget specific version, and there's no way to - // do that with a single pseudo source operation. - if (Opcode == AMDGPU::S_SETPC_B64_return) - Opcode = AMDGPU::S_SETPC_B64; - else if (Opcode == AMDGPU::SI_CALL) { - // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the - // called function (which we need to remove here). - OutMI.setOpcode(TII->pseudoToMCOpcode(AMDGPU::S_SWAPPC_B64)); - MCOperand Dest, Src; - lowerOperand(MI->getOperand(0), Dest); - lowerOperand(MI->getOperand(1), Src); - OutMI.addOperand(Dest); - OutMI.addOperand(Src); - return; - } else if (Opcode == AMDGPU::SI_TCRETURN) { - // TODO: How to use branch immediate and avoid register+add? - Opcode = AMDGPU::S_SETPC_B64; - } + int MCOpcode = Opcode; + auto &STI = MI->getParent()->getParent()->getSubtarget(); + + if (STI.getTargetTriple().getArch() == Triple::amdgcn) { + const auto *TII = static_cast(STI.getInstrInfo()); + + // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We + // need to select it to the subtarget specific version, and there's no way to + // do that with a single pseudo source operation. 
+ if (Opcode == AMDGPU::S_SETPC_B64_return) + Opcode = AMDGPU::S_SETPC_B64; + else if (Opcode == AMDGPU::SI_CALL) { + // SI_CALL is just S_SWAPPC_B64 with an additional operand to track the + // called function (which we need to remove here). + OutMI.setOpcode(TII->pseudoToMCOpcode(AMDGPU::S_SWAPPC_B64)); + MCOperand Dest, Src; + lowerOperand(MI->getOperand(0), Dest); + lowerOperand(MI->getOperand(1), Src); + OutMI.addOperand(Dest); + OutMI.addOperand(Src); + return; + } else if (Opcode == AMDGPU::SI_TCRETURN) { + // TODO: How to use branch immediate and avoid register+add? + Opcode = AMDGPU::S_SETPC_B64; + } - int MCOpcode = TII->pseudoToMCOpcode(Opcode); - if (MCOpcode == -1) { - LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); - C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " - "a target-specific version: " + Twine(MI->getOpcode())); + if (MCOpcode == -1) { + LLVMContext &C = MI->getParent()->getParent()->getFunction().getContext(); + C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have " + "a target-specific version: " + Twine(MI->getOpcode())); + } + MCOpcode = TII->pseudoToMCOpcode(Opcode); } OutMI.setOpcode(MCOpcode); @@ -170,7 +180,7 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - const AMDGPUSubtarget &STI = MF->getSubtarget(); + auto &STI = MF->getSubtarget(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); return MCInstLowering.lowerOperand(MO, MCOp); } @@ -201,7 +211,7 @@ if (emitPseudoExpansionLowering(*OutStreamer, MI)) return; - const AMDGPUSubtarget &STI = MF->getSubtarget(); + const TargetSubtargetInfo &STI = MF->getSubtarget(); AMDGPUMCInstLower MCInstLowering(OutContext, STI, *this); StringRef Err; @@ -259,7 +269,7 @@ MCInstLowering.lower(MI, TmpInst); EmitToStreamer(*OutStreamer, TmpInst); - if (STI.dumpCode()) { + if (AMDGPUSubtarget::get(*MF).dumpCode()) { // Disassemble instruction/operands to text. 
DisasmLines.resize(DisasmLines.size() + 1); std::string &DisasmLine = DisasmLines.back(); Index: lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -152,7 +152,7 @@ IsAMDGCN = TT.getArch() == Triple::amdgcn; IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = TM->getSubtarget(F); + const AMDGPUCommonSubtarget &ST = AMDGPUCommonSubtarget::get(*TM, F); if (!ST.isPromoteAllocaEnabled()) return false; @@ -174,8 +174,8 @@ std::pair AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { - const AMDGPUSubtarget &ST = TM->getSubtarget( - *Builder.GetInsertBlock()->getParent()); + const Function &F = *Builder.GetInsertBlock()->getParent(); + const AMDGPUCommonSubtarget &ST = AMDGPUCommonSubtarget::get(*TM, F); if (!IsAMDHSA) { Function *LocalSizeYFn @@ -261,8 +261,8 @@ } Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { - const AMDGPUSubtarget &ST = TM->getSubtarget( - *Builder.GetInsertBlock()->getParent()); + const AMDGPUCommonSubtarget &ST = + AMDGPUCommonSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic; switch (N) { @@ -599,7 +599,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); - const AMDGPUSubtarget &ST = TM->getSubtarget(F); + const AMDGPUCommonSubtarget &ST = AMDGPUCommonSubtarget::get(*TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so @@ -725,8 +725,7 @@ if (!SufficientLDS) return false; - const AMDGPUSubtarget &ST = - TM->getSubtarget(ContainingFunction); + const AMDGPUCommonSubtarget &ST = AMDGPUCommonSubtarget::get(*TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; const DataLayout &DL = Mod->getDataLayout(); Index: lib/Target/AMDGPU/AMDGPURegisterInfo.td =================================================================== --- lib/Target/AMDGPU/AMDGPURegisterInfo.td +++ lib/Target/AMDGPU/AMDGPURegisterInfo.td @@ -19,5 +19,4 @@ } -include "R600RegisterInfo.td" include "SIRegisterInfo.td" Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -39,22 +39,134 @@ #define GET_SUBTARGETINFO_HEADER #include "AMDGPUGenSubtargetInfo.inc" +#define GET_SUBTARGETINFO_HEADER +#include "R600GenSubtargetInfo.inc" namespace llvm { class StringRef; -class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +class AMDGPUCommonSubtarget { +private: + Triple TargetTriple; + +protected: + // Dummy feature to use for assembler in tablegen. 
+public: + AMDGPUCommonSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const TargetMachine &TM); + + static const AMDGPUCommonSubtarget &get(const MachineFunction &MF); + static const AMDGPUCommonSubtarget &get(const TargetMachine &TM, + const Function &F); + virtual bool enableDX10Clamp() const = 0; + virtual unsigned getAlignmentForImplicitArgPtr() const = 0; + virtual bool isAmdCodeObjectV2(const MachineFunction &MF) const = 0; + virtual bool isAmdHsaOS() const = 0; + virtual bool isAmdPalOS() const = 0; + virtual bool has16BitInsts() const = 0; + virtual bool hasBCNT(unsigned Size) const = 0; + virtual bool hasBFE() const = 0; + virtual bool hasBFI() const = 0; + virtual bool hasMadMixInsts() const = 0; + virtual bool hasCARRY() const = 0; + virtual bool hasFFBH() const = 0; + virtual bool hasFFBL() const = 0; + virtual bool hasFminFmaxLegacy() const = 0; + virtual bool hasFP32Denormals() const = 0; + virtual bool hasFPExceptions() const = 0; + virtual bool hasMulI24() const = 0; + virtual bool hasMulU24() const = 0; + virtual bool hasSDWA() const = 0; + virtual bool hasVOP3PInsts() const = 0; + virtual int getLocalMemorySize() const = 0; + virtual bool dumpCode() const = 0; + virtual bool isPromoteAllocaEnabled() const = 0; + virtual FeatureBitset getFeatureBitsImpl() const = 0; + virtual unsigned getWavefrontSize() const = 0; + + /// \returns Default range flat work group size for a calling convention. + std::pair getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; + + /// \returns Subtarget's default pair of minimum/maximum flat work group sizes + /// for function \p F, or minimum/maximum flat work group sizes explicitly + /// requested using "amdgpu-flat-work-group-size" attribute attached to + /// function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, or violate subtarget's specifications. + std::pair getFlatWorkGroupSizes(const Function &F) const; + + /// \returns Subtarget's default pair of minimum/maximum number of waves per + /// execution unit for function \p F, or minimum/maximum number of waves per + /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute + /// attached to function \p F. + /// + /// \returns Subtarget's default values if explicitly requested values cannot + /// be converted to integer, violate subtarget's specifications, or are not + /// compatible with minimum/maximum number of waves limited by flat work group + /// size, register usage, and/or lds usage. + std::pair getWavesPerEU(const Function &F) const; + + /// Return the amount of LDS that can be used that will not restrict the + /// occupancy lower than WaveCount. + unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, + const Function &) const; + + /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if + /// the given LDS memory size is the only constraint. + unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; + + unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; + + /// \returns Maximum number of work groups per compute unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBitsImpl(), + FlatWorkGroupSize); + } + + /// \returns Minimum flat work group size supported by the subtarget. 
+ unsigned getMinFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBitsImpl()); + } + + /// \returns Maximum flat work group size supported by the subtarget. + unsigned getMaxFlatWorkGroupSize() const { + return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBitsImpl()); + } + + /// \returns Maximum number of waves per execution unit supported by the + /// subtarget and limited by given \p FlatWorkGroupSize. + unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { + return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBitsImpl(), + FlatWorkGroupSize); + } + + /// \returns Minimum number of waves per execution unit supported by the + /// subtarget. + unsigned getMinWavesPerEU() const { + return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBitsImpl()); + } + + unsigned getMaxWavesPerEU() const { return 10; } + + /// Creates value range metadata on an workitemid.* inrinsic call or load. + bool makeLIDRangeMetadata(Instruction *I) const; + + virtual ~AMDGPUCommonSubtarget() {} +}; + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo, + public AMDGPUCommonSubtarget { public: enum Generation { - R600 = 0, - R700, - EVERGREEN, - NORTHERN_ISLANDS, - SOUTHERN_ISLANDS, - SEA_ISLANDS, - VOLCANIC_ISLANDS, - GFX9, + // Gap for R600 generations, so we can do comparisons between + // AMDGPUSubtarget and r600Subtarget. + SOUTHERN_ISLANDS = 4, + SEA_ISLANDS = 5, + VOLCANIC_ISLANDS = 6, + GFX9 = 7, }; enum { @@ -94,10 +206,19 @@ LLVMTrapHandlerRegValue = 1 }; +private: + SIFrameLowering FrameLowering; + + /// GlobalISel related APIs. + std::unique_ptr CallLoweringInfo; + std::unique_ptr InstSelector; + std::unique_ptr Legalizer; + std::unique_ptr RegBankInfo; + protected: // Basic subtarget description. Triple TargetTriple; - Generation Gen; + unsigned Gen; unsigned IsaVersion; unsigned WavefrontSize; int LocalMemorySize; @@ -177,7 +298,6 @@ // Dummy feature to use for assembler in tablegen. bool FeatureDisable; - InstrItineraryData InstrItins; SelectionDAGTargetInfo TSInfo; AMDGPUAS AS; @@ -189,13 +309,30 @@ AMDGPUSubtarget &initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS); - const AMDGPUInstrInfo *getInstrInfo() const override = 0; - const AMDGPUFrameLowering *getFrameLowering() const override = 0; - const AMDGPUTargetLowering *getTargetLowering() const override = 0; - const AMDGPURegisterInfo *getRegisterInfo() const override = 0; + virtual const SIInstrInfo *getInstrInfo() const override = 0; - const InstrItineraryData *getInstrItineraryData() const override { - return &InstrItins; + const SIFrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + + virtual const SITargetLowering *getTargetLowering() const override = 0; + + virtual const SIRegisterInfo *getRegisterInfo() const override = 0; + + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); } // Nothing implemented, just prevent crashes on use. 
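// How call sites reach the split subtargets: instead of asking for a concrete
// AMDGPUSubtarget via MF.getSubtarget<>(), passes in this patch go through
// AMDGPUCommonSubtarget::get(), declared above.  Its definition lives in
// AMDGPUSubtarget.cpp and is not shown in this diff; the sketch below is only
// an illustration of the triple-based dispatch (the same arch check the
// AMDGPUAsmPrinter and AMDGPUMCInstLower hunks use), not the committed body.

#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"

const AMDGPUCommonSubtarget &
AMDGPUCommonSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return MF.getSubtarget<AMDGPUSubtarget>(); // GCN path (concrete SISubtarget).
  return MF.getSubtarget<R600Subtarget>();     // r600 path.
}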
@@ -205,7 +342,7 @@ void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - bool isAmdHsaOS() const { + bool isAmdHsaOS() const override { return TargetTriple.getOS() == Triple::AMDHSA; } @@ -213,15 +350,15 @@ return TargetTriple.getOS() == Triple::Mesa3D; } - bool isAmdPalOS() const { + bool isAmdPalOS() const override { return TargetTriple.getOS() == Triple::AMDPAL; } Generation getGeneration() const { - return Gen; + return (Generation)Gen; } - unsigned getWavefrontSize() const { + unsigned getWavefrontSize() const override { return WavefrontSize; } @@ -229,7 +366,7 @@ return Log2_32(WavefrontSize); } - int getLocalMemorySize() const { + int getLocalMemorySize() const override { return LocalMemorySize; } @@ -241,11 +378,15 @@ return MaxPrivateElementSize; } + FeatureBitset getFeatureBitsImpl() const override { + return getFeatureBits(); + } + AMDGPUAS getAMDGPUAS() const { return AS; } - bool has16BitInsts() const { + bool has16BitInsts() const override { return Has16BitInsts; } @@ -253,7 +394,7 @@ return HasIntClamp; } - bool hasVOP3PInsts() const { + bool hasVOP3PInsts() const override { return HasVOP3PInsts; } @@ -265,6 +406,10 @@ return MIMG_R128; } + bool hasHWFP64() const { + return FP64; + } + bool hasFastFMAF32() const { return FastFMAF32; } @@ -273,71 +418,64 @@ return HalfRate64Ops; } - bool hasAddr64() const { - return (getGeneration() < VOLCANIC_ISLANDS); + virtual bool hasAddr64() const { + return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); } - bool hasBFE() const { - return (getGeneration() >= EVERGREEN); + bool hasBFE() const override { + return true; } - bool hasBFI() const { - return (getGeneration() >= EVERGREEN); + bool hasBFI() const override { + return true; } bool hasBFM() const { return hasBFE(); } - bool hasBCNT(unsigned Size) const { - if (Size == 32) - return (getGeneration() >= EVERGREEN); - - if (Size == 64) - return (getGeneration() >= SOUTHERN_ISLANDS); - - return false; + bool hasBCNT(unsigned Size) const override { + return true; } - bool hasMulU24() const { - return (getGeneration() >= EVERGREEN); + bool hasMulU24() const override { + return true; } - bool hasMulI24() const { - return (getGeneration() >= SOUTHERN_ISLANDS || - hasCaymanISA()); + bool hasMulI24() const override { + return true; } - bool hasFFBL() const { - return (getGeneration() >= EVERGREEN); + bool hasFFBL() const override { + return true; } - bool hasFFBH() const { - return (getGeneration() >= EVERGREEN); + bool hasFFBH() const override { + return true; } - bool hasMed3_16() const { - return getGeneration() >= GFX9; + virtual bool hasMed3_16() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasMin3Max3_16() const { - return getGeneration() >= GFX9; + virtual bool hasMin3Max3_16() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasMadMixInsts() const { + bool hasMadMixInsts() const override { return HasMadMixInsts; } - bool hasCARRY() const { - return (getGeneration() >= EVERGREEN); + bool hasCARRY() const override { + return true; } - bool hasBORROW() const { - return (getGeneration() >= EVERGREEN); + virtual bool hasBORROW() const { + return true; } - bool hasCaymanISA() const { - return CaymanISA; + virtual bool hasCaymanISA() const { + return false; } bool hasFMA() const { @@ -352,7 +490,7 @@ return EnableHugePrivateBuffer; } - bool isPromoteAllocaEnabled() const { + bool isPromoteAllocaEnabled() const override { return EnablePromoteAlloca; } @@ -360,7 +498,7 @@ return EnableUnsafeDSOffsetFolding; } - bool dumpCode() const { + 
bool dumpCode() const override { return DumpCode; } @@ -369,17 +507,11 @@ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if - /// the given LDS memory size is the only constraint. - unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; - - unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; - bool hasFP16Denormals() const { return FP64FP16Denormals; } - bool hasFP32Denormals() const { + bool hasFP32Denormals() const override { return FP32Denormals; } @@ -391,11 +523,11 @@ return getGeneration() >= AMDGPUSubtarget::GFX9; } - bool hasFPExceptions() const { + bool hasFPExceptions() const override { return FPExceptions; } - bool enableDX10Clamp() const { + bool enableDX10Clamp() const override { return DX10Clamp; } @@ -436,7 +568,7 @@ } bool hasApertureRegs() const { - return HasApertureRegs; + return HasApertureRegs; } bool isTrapHandlerEnabled() const { @@ -490,7 +622,7 @@ return isMesa3DOS() && AMDGPU::isShader(MF.getFunction().getCallingConv()); } - bool isAmdCodeObjectV2(const MachineFunction &MF) const { + bool isAmdCodeObjectV2(const MachineFunction &MF) const override { return isAmdHsaOS() || isMesaKernel(MF); } @@ -498,11 +630,11 @@ return getGeneration() >= SEA_ISLANDS; } - bool hasFminFmaxLegacy() const { + bool hasFminFmaxLegacy() const override { return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } - bool hasSDWA() const { + bool hasSDWA() const override { return HasSDWA; } @@ -536,7 +668,7 @@ return isAmdCodeObjectV2(MF) ? 0 : 36; } - unsigned getAlignmentForImplicitArgPtr() const { + unsigned getAlignmentForImplicitArgPtr() const override { return isAmdHsaOS() ? 8 : 4; } @@ -569,134 +701,39 @@ return true; } - void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;} - bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;} + void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } + bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } /// \returns Number of execution units per compute unit supported by the /// subtarget. unsigned getEUsPerCU() const { - return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits()); - } - - /// \returns Maximum number of work groups per compute unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(), - FlatWorkGroupSize); + return AMDGPU::IsaInfo::getEUsPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerCU() const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits()); } /// \returns Maximum number of waves per compute unit supported by the /// subtarget and limited by given \p FlatWorkGroupSize. unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(), + return AMDGPU::IsaInfo::getMaxWavesPerCU(MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } - /// \returns Minimum number of waves per execution unit supported by the - /// subtarget. 
- unsigned getMinWavesPerEU() const { - return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits()); - } - /// \returns Maximum number of waves per execution unit supported by the /// subtarget without any kind of limitation. unsigned getMaxWavesPerEU() const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits()); - } - - /// \returns Maximum number of waves per execution unit supported by the - /// subtarget and limited by given \p FlatWorkGroupSize. - unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(), - FlatWorkGroupSize); - } - - /// \returns Minimum flat work group size supported by the subtarget. - unsigned getMinFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits()); - } - - /// \returns Maximum flat work group size supported by the subtarget. - unsigned getMaxFlatWorkGroupSize() const { - return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits()); + return AMDGPU::IsaInfo::getMaxWavesPerEU(); } /// \returns Number of waves per work group supported by the subtarget and /// limited by given \p FlatWorkGroupSize. unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { - return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(), - FlatWorkGroupSize); - } - - /// \returns Default range flat work group size for a calling convention. - std::pair getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; - - /// \returns Subtarget's default pair of minimum/maximum flat work group sizes - /// for function \p F, or minimum/maximum flat work group sizes explicitly - /// requested using "amdgpu-flat-work-group-size" attribute attached to - /// function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, or violate subtarget's specifications. - std::pair getFlatWorkGroupSizes(const Function &F) const; - - /// \returns Subtarget's default pair of minimum/maximum number of waves per - /// execution unit for function \p F, or minimum/maximum number of waves per - /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute - /// attached to function \p F. - /// - /// \returns Subtarget's default values if explicitly requested values cannot - /// be converted to integer, violate subtarget's specifications, or are not - /// compatible with minimum/maximum number of waves limited by flat work group - /// size, register usage, and/or lds usage. - std::pair getWavesPerEU(const Function &F) const; - - /// Creates value range metadata on an workitemid.* inrinsic call or load. 
- bool makeLIDRangeMetadata(Instruction *I) const; -}; - -class R600Subtarget final : public AMDGPUSubtarget { -private: - R600InstrInfo InstrInfo; - R600FrameLowering FrameLowering; - R600TargetLowering TLInfo; - -public: - R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, - const TargetMachine &TM); - - const R600InstrInfo *getInstrInfo() const override { - return &InstrInfo; - } - - const R600FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - - const R600TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - - const R600RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - - bool hasCFAluBug() const { - return CFALUBug; - } - - bool hasVertexCache() const { - return HasVertexCache; - } - - short getTexVTXClauseSize() const { - return TexVTXClauseSize; + return AMDGPU::IsaInfo::getWavesPerWorkGroup( + MCSubtargetInfo::getFeatureBits(), FlatWorkGroupSize); } }; @@ -747,6 +784,8 @@ const SIRegisterInfo *getRegisterInfo() const override { return &InstrInfo.getRegisterInfo(); } + // static wrappers + static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); // XXX - Why is this here if it isn't in the default pass set? bool enableEarlyIfConversion() const override { @@ -756,7 +795,7 @@ void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; - bool isVGPRSpillingEnabled(const Function& F) const; + bool isVGPRSpillingEnabled(const Function &F) const; unsigned getMaxNumUserSGPRs() const { return 16; @@ -804,7 +843,7 @@ bool debuggerSupported() const { return debuggerInsertNops() && debuggerReserveRegs() && - debuggerEmitPrologue(); + debuggerEmitPrologue(); } bool debuggerInsertNops() const { @@ -846,16 +885,18 @@ unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const; - /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs + /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; - /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs + /// Return the maximum number of waves per SIMD for kernels using \p VGPRs + /// VGPRs unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; /// \returns true if the flat_scratch register should be initialized with the /// pointer to the wave's scratch memory rather than a size and offset. bool flatScratchIsPointer() const { - return getGeneration() >= GFX9; + return getGeneration() >= AMDGPUSubtarget::GFX9; } /// \returns true if the machine has merged shaders in which s0-s7 are @@ -866,35 +907,39 @@ /// \returns SGPR allocation granularity supported by the subtarget. unsigned getSGPRAllocGranule() const { - return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns SGPR encoding granularity supported by the subtarget. unsigned getSGPREncodingGranule() const { - return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getSGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of SGPRs supported by the subtarget. unsigned getTotalNumSGPRs() const { - return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumSGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of SGPRs supported by the subtarget. 
unsigned getAddressableNumSGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumSGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumSGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of SGPRs that meets the given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { - return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU, - Addressable); + return AMDGPU::IsaInfo::getMaxNumSGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU, Addressable); } /// \returns Reserved number of SGPRs for given function \p MF. @@ -912,34 +957,39 @@ /// \returns VGPR allocation granularity supported by the subtarget. unsigned getVGPRAllocGranule() const { - return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPRAllocGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns VGPR encoding granularity supported by the subtarget. unsigned getVGPREncodingGranule() const { - return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits()); + return AMDGPU::IsaInfo::getVGPREncodingGranule( + MCSubtargetInfo::getFeatureBits()); } /// \returns Total number of VGPRs supported by the subtarget. unsigned getTotalNumVGPRs() const { - return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getTotalNumVGPRs(MCSubtargetInfo::getFeatureBits()); } /// \returns Addressable number of VGPRs supported by the subtarget. unsigned getAddressableNumVGPRs() const { - return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits()); + return AMDGPU::IsaInfo::getAddressableNumVGPRs( + MCSubtargetInfo::getFeatureBits()); } /// \returns Minimum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMinNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMinNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Maximum number of VGPRs that meets given number of waves per /// execution unit requirement supported by the subtarget. unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { - return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU); + return AMDGPU::IsaInfo::getMaxNumVGPRs(MCSubtargetInfo::getFeatureBits(), + WavesPerEU); } /// \returns Reserved number of VGPRs for given function \p MF. 
@@ -962,6 +1012,204 @@ const override; }; + +class R600Subtarget final : public R600GenSubtargetInfo, + public AMDGPUCommonSubtarget { +public: + enum Generation { R600 = 0, R700 = 1, EVERGREEN = 2, NORTHERN_ISLANDS = 3 }; + +private: + R600InstrInfo InstrInfo; + R600FrameLowering FrameLowering; + R600TargetLowering TLInfo; + unsigned WavefrontSize; + bool FMA; + bool CaymanISA; + bool CFALUBug; + bool DX10Clamp; + bool HasVertexCache; + bool FP32Denormals; + bool R600ALUInst; + bool DumpCode; + bool FP64; + bool EnablePromoteAlloca; + short TexVTXClauseSize; + Generation Gen; + int LocalMemorySize; + unsigned MaxPrivateElementSize; + int LDSBankCount; + InstrItineraryData InstrItins; + SelectionDAGTargetInfo TSInfo; + AMDGPUAS AS; + +public: + R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, + const TargetMachine &TM); + + const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } + + const R600FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + + const R600TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + + const R600RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + + const InstrItineraryData *getInstrItineraryData() const override { + return &InstrItins; + } + + // Nothing implemented, just prevent crashes on use. + const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + Generation getGeneration() const { + return Gen; + } + + unsigned getStackAlignment() const { + return 4; + } + + bool isAmdCodeObjectV2(const MachineFunction &MF) const override { + return false; + } + + bool isAmdHsaOS() const override { + return false; + } + + bool isAmdPalOS() const override { + return false; + } + + bool dumpCode() const override { + return DumpCode; + } + + bool enableDX10Clamp() const override { + return DX10Clamp; + } + + R600Subtarget &initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, StringRef FS); + + FeatureBitset getFeatureBitsImpl() const override { + return getFeatureBits(); + } + + unsigned getAlignmentForImplicitArgPtr() const override { + return 4; + } + + bool isPromoteAllocaEnabled() const override { + return EnablePromoteAlloca; + } + + bool hasAddr64() const { + return false; + } + + bool has16BitInsts() const override { + return false; + } + + unsigned getWavefrontSize() const override { + return WavefrontSize; + } + + int getLocalMemorySize() const override { + return LocalMemorySize; + } + + bool hasFP32Denormals() const override { + return FP32Denormals; + } + + bool hasBFE() const override { + return (getGeneration() >= EVERGREEN); + } + + bool hasBFI() const override { + return (getGeneration() >= EVERGREEN); + } + + bool hasBCNT(unsigned Size) const override { + if (Size == 32) + return (getGeneration() >= EVERGREEN); + + return false; + } + + bool hasBORROW() const { + return (getGeneration() >= EVERGREEN); + } + + bool hasMadMixInsts() const override { + return false; + } + + bool hasCARRY() const override { + return (getGeneration() >= EVERGREEN); + } + + bool hasCaymanISA() const { + return CaymanISA; + } + + bool hasFFBL() const override { + return (getGeneration() >= EVERGREEN); + } + + bool hasFFBH() const override { return (getGeneration() >= EVERGREEN); } + + bool hasFMA() const { return FMA; } + + bool hasFminFmaxLegacy() const override { return true; } + + bool hasFPExceptions() const override { return false; } + + bool 
hasMed3_16() const { return false; }
+
+  bool hasMin3Max3_16() { return false; }
+
+  bool hasMulU24() const override { return (getGeneration() >= EVERGREEN); }
+
+  bool hasMulI24() const override { return hasCaymanISA(); }
+
+  bool hasSDWA() const override { return false; }
+
+  bool hasVOP3PInsts() const override { return false; }
+
+  unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
+    return 36;
+  }
+
+  bool hasCFAluBug() const { return CFALUBug; }
+
+  bool hasVertexCache() const { return HasVertexCache; }
+
+  short getTexVTXClauseSize() const { return TexVTXClauseSize; }
+
+  AMDGPUAS getAMDGPUAS() const { return AS; }
+
+  bool enableMachineScheduler() const override {
+    return true;
+  }
+
+  bool enableSubRegLiveness() const override {
+    return true;
+  }
+};
+
 } // end namespace llvm

 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -23,6 +23,7 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include <algorithm>
@@ -34,9 +35,36 @@
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "AMDGPUGenSubtargetInfo.inc"
+#define GET_SUBTARGETINFO_TARGET_DESC
+#define GET_SUBTARGETINFO_CTOR
+#include "R600GenSubtargetInfo.inc"

 AMDGPUSubtarget::~AMDGPUSubtarget() = default;

+R600Subtarget &
+R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
+                                               StringRef GPU, StringRef FS) {
+  SmallString<256> FullFS("+promote-alloca,+dx10-clamp,");
+  FullFS += FS;
+  ParseSubtargetFeatures(GPU, FullFS);
+
+  // FIXME: I don't think Evergreen has any useful support for
+  // denormals, but should be checked. Should we issue a warning somewhere
+  // if someone tries to enable these?
+  if (getGeneration() <= R600Subtarget::NORTHERN_ISLANDS) {
+    FP32Denormals = false;
+  }
+
+  // Set defaults if needed.
+  if (MaxPrivateElementSize == 0)
+    MaxPrivateElementSize = 4;
+
+  if (LDSBankCount == 0)
+    LDSBankCount = 32;
+
+  return *this;
+}
+
 AMDGPUSubtarget &
 AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                                  StringRef GPU, StringRef FS) {
@@ -96,11 +124,16 @@
   return *this;
 }

+AMDGPUCommonSubtarget::AMDGPUCommonSubtarget(const Triple &TT, StringRef GPU,
+                                             StringRef FS, const TargetMachine &TM) { }
+
 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
-                                 const TargetMachine &TM)
-  : AMDGPUGenSubtargetInfo(TT, GPU, FS),
+                                 const TargetMachine &TM) :
+    AMDGPUGenSubtargetInfo(TT, GPU, FS),
+    AMDGPUCommonSubtarget(TT, GPU, FS, TM),
+    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
     TargetTriple(TT),
-    Gen(TT.getArch() == Triple::amdgcn ?
SOUTHERN_ISLANDS : R600), + Gen(SOUTHERN_ISLANDS), IsaVersion(ISAVersion0_0_0), WavefrontSize(0), LocalMemorySize(0), @@ -137,9 +170,6 @@ DumpCode(false), FP64(false), - FMA(false), - MIMG_R128(false), - IsGCN(false), GCN3Encoding(false), CIInsts(false), GFX9Insts(false), @@ -168,20 +198,14 @@ AddNoCarryInsts(false), HasUnpackedD16VMem(false), - R600ALUInst(false), - CaymanISA(false), - CFALUBug(false), - HasVertexCache(false), - TexVTXClauseSize(0), ScalarizeGlobal(false), - FeatureDisable(false), - InstrItins(getInstrItineraryForCPU(GPU)) { + FeatureDisable(false) { AS = AMDGPU::getAMDGPUAS(TT); initializeSubtargetDependencies(TT, GPU, FS); } -unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, +unsigned AMDGPUCommonSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves, const Function &F) const { if (NWaves == 1) return getLocalMemorySize(); @@ -191,7 +215,7 @@ return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } -unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, +unsigned AMDGPUCommonSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &F) const { unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); @@ -204,13 +228,13 @@ } unsigned -AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { +AMDGPUCommonSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const { const auto *MFI = MF.getInfo(); return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction()); } std::pair -AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { +AMDGPUCommonSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const { switch (CC) { case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: @@ -228,7 +252,7 @@ } } -std::pair AMDGPUSubtarget::getFlatWorkGroupSizes( +std::pair AMDGPUCommonSubtarget::getFlatWorkGroupSizes( const Function &F) const { // FIXME: 1024 if function. // Default minimum/maximum flat work group sizes. @@ -258,7 +282,7 @@ return Requested; } -std::pair AMDGPUSubtarget::getWavesPerEU( +std::pair AMDGPUCommonSubtarget::getWavesPerEU( const Function &F) const { // Default minimum/maximum number of waves per execution unit. 
std::pair Default(1, getMaxWavesPerEU()); @@ -306,7 +330,7 @@ return Requested; } -bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { +bool AMDGPUCommonSubtarget::makeLIDRangeMetadata(Instruction *I) const { Function *Kernel = I->getParent()->getParent(); unsigned MinSize = 0; unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second; @@ -370,10 +394,15 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : - AMDGPUSubtarget(TT, GPU, FS, TM), + R600GenSubtargetInfo(TT, GPU, FS), + AMDGPUCommonSubtarget(TT, GPU, FS, TM), InstrInfo(*this), FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) {} + TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)), + DX10Clamp(false), + InstrItins(getInstrItineraryForCPU(GPU)), + AS (AMDGPU::getAMDGPUAS(TT)) { + } SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM) @@ -617,3 +646,17 @@ std::vector> &Mutations) const { Mutations.push_back(llvm::make_unique(&InstrInfo)); } + +const AMDGPUCommonSubtarget &AMDGPUCommonSubtarget::get(const MachineFunction &MF) { + if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) + return static_cast(MF.getSubtarget()); + else + return static_cast(MF.getSubtarget()); +} + +const AMDGPUCommonSubtarget &AMDGPUCommonSubtarget::get(const TargetMachine &TM, const Function &F) { + if (TM.getTargetTriple().getArch() == Triple::amdgcn) + return static_cast(TM.getSubtarget(F)); + else + return static_cast(TM.getSubtarget(F)); +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -17,6 +17,7 @@ #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" +#include "R600IntrinsicInfo.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -34,7 +35,6 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { protected: std::unique_ptr TLOF; - AMDGPUIntrinsicInfo IntrinsicInfo; AMDGPUAS AS; StringRef getGPUName(const Function &F) const; @@ -49,12 +49,9 @@ CodeGenOpt::Level OL); ~AMDGPUTargetMachine() override; - const AMDGPUSubtarget *getSubtargetImpl() const; - const AMDGPUSubtarget *getSubtargetImpl(const Function &) const override = 0; + const TargetSubtargetInfo *getSubtargetImpl() const; + const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override = 0; - const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { - return &IntrinsicInfo; - } TargetTransformInfo getTargetTransformInfo(const Function &F) override; TargetLoweringObjectFile *getObjFileLowering() const override { @@ -79,6 +76,7 @@ class R600TargetMachine final : public AMDGPUTargetMachine { private: + R600IntrinsicInfo IntrinsicInfo; mutable StringMap> SubtargetMap; public: @@ -91,6 +89,10 @@ const R600Subtarget *getSubtargetImpl(const Function &) const override; + const R600IntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + bool isMachineVerifierClean() const override { return false; } @@ -102,6 +104,7 @@ class GCNTargetMachine final : public AMDGPUTargetMachine { private: + AMDGPUIntrinsicInfo IntrinsicInfo; mutable StringMap> SubtargetMap; public: @@ -114,6 +117,10 @@ const SISubtarget *getSubtargetImpl(const Function &) const override; + const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override { + return &IntrinsicInfo; + } + bool useIPRA() const 
override { return true; } Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -565,7 +565,10 @@ TargetTransformInfo AMDGPUTargetMachine::getTargetTransformInfo(const Function &F) { - return TargetTransformInfo(AMDGPUTTIImpl(this, F)); + if (getTargetTriple().getArch() == Triple::r600) + return TargetTransformInfo(R600TTIImpl(this, F)); + else + return TargetTransformInfo(AMDGPUTTIImpl(this, F)); } void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -45,8 +45,9 @@ friend BaseT; - const AMDGPUSubtarget *ST; - const AMDGPUTargetLowering *TLI; + const TargetMachine *TM; + const TargetSubtargetInfo *ST; + const TargetLowering *TLI; bool IsGraphicsShader; const FeatureBitset InlineFeatureIgnoreList = { @@ -74,8 +75,8 @@ AMDGPU::HalfRate64Ops }; - const AMDGPUSubtarget *getST() const { return ST; } - const AMDGPUTargetLowering *getTLI() const { return TLI; } + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLowering *getTLI() const { return TLI; } static inline int getFullRateInstrCost() { return TargetTransformInfo::TCC_Basic; @@ -94,17 +95,24 @@ // On some parts, normal fp64 operations are half rate, and others // quarter. This also applies to some integer operations. inline int get64BitInstrCost() const { - return ST->hasHalfRate64Ops() ? + if (TM->getTargetTriple().getArch() == Triple::r600) + return getQuarterRateInstrCost(); + return static_cast(ST)->hasHalfRate64Ops() ? getHalfRateInstrCost() : getQuarterRateInstrCost(); } public: explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), + TM(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()), IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {} + bool isGCN() const { return TM->getTargetTriple().getArch() == Triple::amdgcn; } + + bool isR600() const { return TM->getTargetTriple().getArch() == Triple::r600; } + bool hasBranchDivergence() { return true; } void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, @@ -158,10 +166,12 @@ unsigned getFlatAddressSpace() const { // Don't bother running InferAddressSpaces pass on graphics shaders which // don't use flat addressing. - if (IsGraphicsShader) + if (IsGraphicsShader || TM->getTargetTriple().getArch() == Triple::r600) return -1; - return ST->hasFlatAddressSpace() ? - ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE; + const AMDGPUSubtarget *Subtarget = static_cast(ST); + return Subtarget->hasFlatAddressSpace() ? 
+ Subtarget->getAMDGPUAS().FLAT_ADDRESS : + Subtarget->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE; } unsigned getVectorSplitCost() { return 0; } @@ -175,6 +185,48 @@ unsigned getInliningThresholdMultiplier() { return 9; } }; +class R600TTIImpl final : public BasicTTIImplBase { + using BaseT = BasicTTIImplBase; + using TTI = TargetTransformInfo; + + friend BaseT; + + const TargetMachine *TM; + const TargetSubtargetInfo *ST; + const TargetLowering *TLI; + AMDGPUTTIImpl CommonTTI; + +public: + explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), + TM(TM), + ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()), + CommonTTI(TM, F) {} + + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLowering *getTLI() const { return TLI; } + + void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP); + unsigned getHardwareNumberOfRegisters(bool Vec) const; + unsigned getNumberOfRegisters(bool Vec) const; + unsigned getRegisterBitWidth(bool Vector) const; + unsigned getMinVectorRegisterBitWidth() const; + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const; + bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const; + unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getCFInstrCost(unsigned Opcode); + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); +}; + } // end namespace llvm #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -102,7 +102,7 @@ unsigned ThresholdPrivate = UnrollThresholdPrivate; unsigned ThresholdLocal = UnrollThresholdLocal; unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - AMDGPUAS ASST = ST->getAMDGPUAS(); + const AMDGPUAS &ASST = AMDGPU::getAMDGPUAS(TM->getTargetTriple()); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getModule()->getDataLayout(); unsigned LocalGEPsSeen = 0; @@ -212,12 +212,7 @@ unsigned AMDGPUTTIImpl::getHardwareNumberOfRegisters(bool Vec) const { // The concept of vector registers doesn't really exist. Some packed vector // operations operate on the normal 32-bit registers. - - // Number of VGPRs on SI. - if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) - return 256; - - return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 
+ return 256; } unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) const { @@ -256,11 +251,13 @@ } unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { - AMDGPUAS AS = ST->getAMDGPUAS(); + const Triple &Triple = TM->getTargetTriple(); + const AMDGPUAS &AS = AMDGPU::getAMDGPUAS(Triple); + auto Subtarget = static_cast(ST); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS_32BIT) { - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) + if (Triple.getArch() == Triple::r600) return 128; return 512; } @@ -269,13 +266,20 @@ return 128; if (AddrSpace == AS.LOCAL_ADDRESS || - AddrSpace == AS.REGION_ADDRESS) - return ST->useDS128() ? 128 : 64; + AddrSpace == AS.REGION_ADDRESS) { + if (Triple.getArch() == Triple::r600) + return 64; + return Subtarget->useDS128() ? 128 : 64; + } - if (AddrSpace == AS.PRIVATE_ADDRESS) - return 8 * ST->getMaxPrivateElementSize(); + if (AddrSpace == AS.PRIVATE_ADDRESS) { + if (Triple.getArch() == Triple::r600) + return 32; + + return 8 * Subtarget->getMaxPrivateElementSize(); + } - if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && + if (Triple.getArch() == Triple::r600 && (AddrSpace == AS.PARAM_D_ADDRESS || AddrSpace == AS.PARAM_I_ADDRESS || (AddrSpace >= AS.CONSTANT_BUFFER_0 && @@ -290,9 +294,10 @@ // We allow vectorization of flat stores, even though we may need to decompose // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. - if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + auto *Subtarget = static_cast(ST); + if (AddrSpace == AMDGPU::getAMDGPUAS(TM->getTargetTriple()).PRIVATE_ADDRESS) { + return (Alignment >= 4 || Subtarget->hasUnalignedScratchAccess()) && + ChainSizeInBytes <= Subtarget->getMaxPrivateElementSize(); } return true; } @@ -352,11 +357,12 @@ TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) { EVT OrigTy = TLI->getValueType(DL, Ty); - if (!OrigTy.isSimple()) { + if (isR600() || !OrigTy.isSimple()) { return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); } + auto STI = static_cast(ST); // Legalize the type. std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -414,7 +420,7 @@ if (SLT == MVT::f64) { int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost(); // Add cost of workaround. - if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) + if (STI->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) Cost += 3 * getFullRateInstrCost(); return LT.first * Cost * NElts; @@ -422,13 +428,13 @@ if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) { // TODO: This is more complicated, unsafe flags etc. 
- if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) || - (SLT == MVT::f16 && ST->has16BitInsts())) { + if ((SLT == MVT::f32 && !STI->hasFP32Denormals()) || + (SLT == MVT::f16 && STI->has16BitInsts())) { return LT.first * getQuarterRateInstrCost() * NElts; } } - if (SLT == MVT::f16 && ST->has16BitInsts()) { + if (SLT == MVT::f16 && STI->has16BitInsts()) { // 2 x v_cvt_f32_f16 // f32 rcp // f32 fmul @@ -441,7 +447,7 @@ if (SLT == MVT::f32 || SLT == MVT::f16) { int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost(); - if (!ST->hasFP32Denormals()) { + if (!STI->hasFP32Denormals()) { // FP mode switches. Cost += 2 * getFullRateInstrCost(); } @@ -470,13 +476,15 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { + bool Has16BitInsts = isGCN() && + static_cast(ST)->has16BitInsts(); switch (Opcode) { case Instruction::ExtractElement: case Instruction::InsertElement: { unsigned EltSize = DL.getTypeSizeInBits(cast(ValTy)->getElementType()); if (EltSize < 32) { - if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) + if (EltSize == 16 && Index == 0 && Has16BitInsts) return 0; return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -534,7 +542,8 @@ // All other loads are not divergent, because if threads issue loads with the // same arguments, they will always get the same result. if (const LoadInst *Load = dyn_cast(V)) - return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS; + return Load->getPointerAddressSpace() == + AMDGPU::getAMDGPUAS(TM->getTargetTriple()).PRIVATE_ADDRESS; // Atomics are divergent because they are executed sequentially: when an // atomic operation refers to the same address in each thread, then each @@ -568,7 +577,7 @@ unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (ST->hasVOP3PInsts()) { + if (isGCN() && static_cast(ST)->hasVOP3PInsts()) { VectorType *VT = cast(Tp); if (VT->getNumElements() == 2 && DL.getTypeSizeInBits(VT->getElementType()) == 16) { @@ -601,3 +610,84 @@ FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList; return ((RealCallerBits & RealCalleeBits) == RealCalleeBits); } + +void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, + TTI::UnrollingPreferences &UP) { + CommonTTI.getUnrollingPreferences(L, SE, UP); +} + +unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const { + return 4 * 128; // XXX - 4 channels. Should these count as vector instead? 
+} + +unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const { + return getHardwareNumberOfRegisters(Vec); +} + +unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const { + return 32; +} + +unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { + return 32; +} + +unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { + const AMDGPUAS &AS = AMDGPU::getAMDGPUAS(TM->getTargetTriple()); + if (AddrSpace == AS.GLOBAL_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS) + return 128; + if (AddrSpace == AS.LOCAL_ADDRESS || + AddrSpace == AS.REGION_ADDRESS) + return 64; + if (AddrSpace == AS.PRIVATE_ADDRESS) + return 32; + + if ((AddrSpace == AS.PARAM_D_ADDRESS || + AddrSpace == AS.PARAM_I_ADDRESS || + (AddrSpace >= AS.CONSTANT_BUFFER_0 && + AddrSpace <= AS.CONSTANT_BUFFER_15))) + return 128; + llvm_unreachable("unhandled address space"); +} + +bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + // We allow vectorization of flat stores, even though we may need to decompose + // them later if they may access private memory. We don't have enough context + // here, and legalization can handle it. + if (AddrSpace == AMDGPU::getAMDGPUAS(TM->getTargetTriple()).PRIVATE_ADDRESS) + return false; + return true; +} + +bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, + unsigned Alignment, + unsigned AddrSpace) const { + return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); +} + +unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) { + // Disable unrolling if the loop is not vectorized. + // TODO: Enable this again. 
+ if (VF == 1) + return 1; + + return 8; +} + +unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) { + return CommonTTI.getCFInstrCost(Opcode); +} + +int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + return CommonTTI.getVectorInstrCost(Opcode, ValTy, Index); +} Index: lib/Target/AMDGPU/AMDILCFGStructurizer.cpp =================================================================== --- lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -437,19 +437,19 @@ for (;; --I) { if (I == MBB.end()) continue; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { switch (I->getOperand(2).getImm()) { - case AMDGPU::PRED_SETE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + I->getOperand(2).setImm(R600::PRED_SETNE_INT); return; - case AMDGPU::PRED_SETNE_INT: - I->getOperand(2).setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + I->getOperand(2).setImm(R600::PRED_SETE_INT); return; - case AMDGPU::PRED_SETE: - I->getOperand(2).setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + I->getOperand(2).setImm(R600::PRED_SETNE); return; - case AMDGPU::PRED_SETNE: - I->getOperand(2).setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + I->getOperand(2).setImm(R600::PRED_SETE); return; default: llvm_unreachable("PRED_X Opcode invalid!"); @@ -518,10 +518,10 @@ int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALNZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -529,10 +529,10 @@ int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + case R600::JUMP_COND: + case R600::JUMP: return R600::IF_PREDICATE_SET; + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return R600::IF_LOGICALZ_f32; default: llvm_unreachable("internal error"); } return -1; @@ -540,8 +540,8 @@ int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALNZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -549,8 +549,8 @@ int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) { switch(OldOpcode) { - case AMDGPU::JUMP_COND: - case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32; + case R600::JUMP_COND: + case R600::JUMP: return R600::CONTINUE_LOGICALZ_i32; default: llvm_unreachable("internal error"); } return -1; @@ -578,9 +578,9 @@ bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP_COND: - case AMDGPU::BRANCH_COND_i32: - case AMDGPU::BRANCH_COND_f32: return true; + case R600::JUMP_COND: + case R600::BRANCH_COND_i32: + case R600::BRANCH_COND_f32: return true; default: return false; } @@ -589,8 +589,8 @@ bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) { switch (MI->getOpcode()) { - case AMDGPU::JUMP: - case AMDGPU::BRANCH: + case 
R600::JUMP: + case R600::BRANCH: return true; default: return false; @@ -639,7 +639,7 @@ MachineBasicBlock::reverse_iterator It = MBB->rbegin(); if (It != MBB->rend()) { MachineInstr *instr = &(*It); - if (instr->getOpcode() == AMDGPU::RETURN) + if (instr->getOpcode() == R600::RETURN) return instr; } return nullptr; @@ -693,8 +693,8 @@ MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator It = Pre; while (It != E) { - if (Pre->getOpcode() == AMDGPU::CONTINUE - && It->getOpcode() == AMDGPU::ENDLOOP) + if (Pre->getOpcode() == R600::CONTINUE + && It->getOpcode() == R600::ENDLOOP) ContInstr.push_back(&*Pre); Pre = It; ++It; @@ -1338,15 +1338,15 @@ bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2); - //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL" - MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF); + //insert R600::ENDIF to avoid special case "input landBlk == NULL" + MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, R600::ENDIF); if (LandBlkHasOtherPred) { report_fatal_error("Extra register needed to handle CFG"); unsigned CmpResReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); report_fatal_error("Extra compare instruction needed to handle CFG"); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, CmpResReg, DebugLoc()); } @@ -1354,7 +1354,7 @@ // cause an assertion failure in the PostRA scheduling pass. unsigned InitReg = HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC); - insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg, + insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg, DebugLoc()); if (MigrateTrue) { @@ -1364,7 +1364,7 @@ // (initVal != 1). 
report_fatal_error("Extra register needed to handle CFG"); } - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); if (MigrateFalse) { migrateInstruction(FalseMBB, LandBlk, I); @@ -1376,7 +1376,7 @@ if (LandBlkHasOtherPred) { // add endif - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); // put initReg = 2 to other predecessors of landBlk for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(), @@ -1463,7 +1463,7 @@ } if (FalseMBB) { - insertInstrBefore(I, AMDGPU::ELSE); + insertInstrBefore(I, R600::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); MBB->removeSuccessor(FalseMBB, true); @@ -1472,7 +1472,7 @@ retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } - insertInstrBefore(I, AMDGPU::ENDIF); + insertInstrBefore(I, R600::ENDIF); BranchMI->eraseFromParent(); @@ -1485,8 +1485,8 @@ DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber() << " land = BB" << LandMBB->getNumber() << "\n";); - insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); - insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); + insertInstrBefore(DstBlk, R600::WHILELOOP, DebugLoc()); + insertInstrEnd(DstBlk, R600::ENDLOOP, DebugLoc()); DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1501,9 +1501,9 @@ MachineBasicBlock::iterator I = BranchMI; if (TrueBranch != LandMBB) reversePredicateSetter(I, *I->getParent()); - insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL); - insertInstrBefore(I, AMDGPU::BREAK); - insertInstrBefore(I, AMDGPU::ENDIF); + insertCondBranchBefore(ExitingMBB, I, R600::IF_PREDICATE_SET, R600::PREDICATE_BIT, DL); + insertInstrBefore(I, R600::BREAK); + insertInstrBefore(I, R600::ENDIF); //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks @@ -1532,8 +1532,8 @@ getBranchZeroOpcode(OldOpcode); insertCondBranchBefore(I, BranchOpcode, DL); // insertEnd to ensure phi-moves, if exist, go before the continue-instr. - insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL); - insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL); + insertInstrEnd(ContingMBB, R600::CONTINUE, DL); + insertInstrEnd(ContingMBB, R600::ENDIF, DL); } else { int BranchOpcode = TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) : @@ -1548,7 +1548,7 @@ // location we've just inserted that reference here so it should be // representative insertEnd to ensure phi-moves, if exist, go before the // continue-instr. 
- insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, + insertInstrEnd(ContingMBB, R600::CONTINUE, getLastDebugLocInBB(ContingMBB)); } } @@ -1680,7 +1680,7 @@ SmallVectorImpl &RetMBB) { MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock(); FuncRep->push_back(DummyExitBlk); //insert to function - insertInstrEnd(DummyExitBlk, AMDGPU::RETURN); + insertInstrEnd(DummyExitBlk, R600::RETURN); for (SmallVectorImpl::iterator It = RetMBB.begin(), E = RetMBB.end(); It != E; ++It) { Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -4,7 +4,6 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) -tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) tablegen(LLVM AMDGPUGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) @@ -15,6 +14,17 @@ tablegen(LLVM AMDGPUGenSearchableTables.inc -gen-searchable-tables) tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) +set(LLVM_TARGET_DEFINITIONS R600.td) +tablegen(LLVM R600GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM R600GenCallingConv.inc -gen-callingconv) +tablegen(LLVM R600GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM R600GenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM R600GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM R600GenIntrinsics.inc -gen-tgt-intrinsic) +tablegen(LLVM R600GenMCCodeEmitter.inc -gen-emitter) +tablegen(LLVM R600GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM R600GenSubtargetInfo.inc -gen-subtarget) + add_public_tablegen_target(AMDGPUCommonTableGen) add_llvm_target(AMDGPUCodeGen @@ -67,6 +77,7 @@ R600ExpandSpecialInstrs.cpp R600FrameLowering.cpp R600InstrInfo.cpp + R600IntrinsicInfo.cpp R600ISelLowering.cpp R600MachineFunctionInfo.cpp R600MachineScheduler.cpp Index: lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp =================================================================== --- lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -20,6 +20,7 @@ #include "Disassembler/AMDGPUDisassembler.h" #include "AMDGPU.h" #include "AMDGPURegisterInfo.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" Index: lib/Target/AMDGPU/EvergreenInstructions.td =================================================================== --- lib/Target/AMDGPU/EvergreenInstructions.td +++ lib/Target/AMDGPU/EvergreenInstructions.td @@ -14,14 +14,13 @@ //===----------------------------------------------------------------------===// def isEG : Predicate< - "Subtarget->getGeneration() >= AMDGPUSubtarget::EVERGREEN && " - "Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS && " + "Subtarget->getGeneration() >= R600Subtarget::EVERGREEN && " "!Subtarget->hasCaymanISA()" >; def isEGorCayman : Predicate< - "Subtarget->getGeneration() == AMDGPUSubtarget::EVERGREEN ||" - "Subtarget->getGeneration() == AMDGPUSubtarget::NORTHERN_ISLANDS" + "Subtarget->getGeneration() == R600Subtarget::EVERGREEN ||" + "Subtarget->getGeneration() == R600Subtarget::NORTHERN_ISLANDS" >; class EGPat : AMDGPUPat { Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- 
lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -216,13 +216,16 @@ raw_ostream &O); }; -// FIXME: R600 specific parts of AMDGPUInstrPrinter should be moved here, and -// MCTargetDesc should be using R600InstPrinter for the R600 target. -class R600InstPrinter : public AMDGPUInstPrinter { +class R600InstPrinter : public MCInstPrinter { public: R600InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) - : AMDGPUInstPrinter(MAI, MII, MRI) {} + : MCInstPrinter(MAI, MII, MRI) {} + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O); Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -505,11 +505,6 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast(this)->printOperand(MI, OpNo, O); - return; - } - if (OpNo >= MI->getNumOperands()) { O << "/*Missing OP" << OpNo << "*/"; return; @@ -955,11 +950,6 @@ void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - if (!STI.getFeatureBits()[AMDGPU::FeatureGCN]) { - static_cast(this)->printMemOperand(MI, OpNo, O); - return; - } - printOperand(MI, OpNo, STI, O); O << ", "; printOperand(MI, OpNo + 1, STI, O); @@ -985,16 +975,6 @@ O << Asm; } -void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printAbs(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printClamp(MI, OpNo, O); -} - void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1021,70 +1001,6 @@ O << " div:2"; } -void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast(this)->printLiteral(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printLast(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printNeg(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printOMOD(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printRel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast(this)->printUpdateExecMask(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast(this)->printUpdatePred(MI, OpNo, O); -} 
- -void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printWrite(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, - raw_ostream &O) { - static_cast(this)->printBankSwizzle(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printRSel(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printCT(MI, OpNo, O); -} - -void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo, - const MCSubtargetInfo &STI, raw_ostream &O) { - static_cast(this)->printKCache(MI, OpNo, O); -} - void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -1289,6 +1205,13 @@ #include "AMDGPUGenAsmWriter.inc" +void R600InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot, const MCSubtargetInfo &STI) { + O.flush(); + printInstruction(MI, O); + printAnnotation(O, Annot); +} + void R600InstPrinter::printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O) { AMDGPUInstPrinter::printIfSet(MI, OpNo, O, '|'); @@ -1407,7 +1330,7 @@ if (Op.isReg()) { switch (Op.getReg()) { // This is the default predicate state, so we don't need to print it. - case AMDGPU::PRED_SEL_OFF: + case R600::PRED_SEL_OFF: break; default: @@ -1483,3 +1406,5 @@ O << " (MASKED)"; } } + +#include "R600GenAsmWriter.inc" Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -40,6 +40,7 @@ MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); +MCInstrInfo *createR600MCInstrInfo(); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, @@ -59,6 +60,10 @@ #include "AMDGPUGenRegisterInfo.inc" #undef GET_REGINFO_ENUM +#define GET_REGINFO_ENUM +#include "R600GenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + #define GET_INSTRINFO_ENUM #define GET_INSTRINFO_OPERAND_ENUM #define GET_INSTRINFO_SCHED_ENUM @@ -67,9 +72,20 @@ #undef GET_INSTRINFO_OPERAND_ENUM #undef GET_INSTRINFO_ENUM +#define GET_INSTRINFO_ENUM +#define GET_INSTRINFO_OPERAND_ENUM +#define GET_INSTRINFO_SCHED_ENUM +#include "R600GenInstrInfo.inc" +#undef GET_INSTRINFO_SCHED_ENUM +#undef GET_INSTRINFO_OPERAND_ENUM +#undef GET_INSTRINFO_ENUM #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_ENUM +#include "R600GenSubtargetInfo.inc" +#undef GET_SUBTARGETINFO_ENUM + #endif Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -37,9 +37,17 @@ #define GET_SUBTARGETINFO_MC_DESC #include "AMDGPUGenSubtargetInfo.inc" +#define NoSchedModel NoSchedModelR600 +#define GET_SUBTARGETINFO_MC_DESC +#include "R600GenSubtargetInfo.inc" +#undef NoSchedModelR600 + #define GET_REGINFO_MC_DESC #include "AMDGPUGenRegisterInfo.inc" +#define GET_REGINFO_MC_DESC +#include "R600GenRegisterInfo.inc" + static MCInstrInfo *createAMDGPUMCInstrInfo() { 
MCInstrInfo *X = new MCInstrInfo(); InitAMDGPUMCInstrInfo(X); @@ -48,12 +56,17 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { MCRegisterInfo *X = new MCRegisterInfo(); - InitAMDGPUMCRegisterInfo(X, 0); + if (TT.getArch() == Triple::r600) + InitR600MCRegisterInfo(X, 0); + else + InitAMDGPUMCRegisterInfo(X, 0); return X; } static MCSubtargetInfo * createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + if (TT.getArch() == Triple::r600) + return createR600MCSubtargetInfoImpl(TT, CPU, FS); return createAMDGPUMCSubtargetInfoImpl(TT, CPU, FS); } @@ -62,8 +75,10 @@ const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - return T.getArch() == Triple::r600 ? new R600InstPrinter(MAI, MII, MRI) : - new AMDGPUInstPrinter(MAI, MII, MRI); + if (T.getArch() == Triple::r600) + return new R600InstPrinter(MAI, MII, MRI); + else + return new AMDGPUInstPrinter(MAI, MII, MRI); } static MCTargetStreamer *createAMDGPUAsmTargetStreamer(MCStreamer &S, @@ -89,10 +104,12 @@ } extern "C" void LLVMInitializeAMDGPUTargetMC() { + + TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo); for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) { RegisterMCAsmInfo X(*T); - TargetRegistry::RegisterMCInstrInfo(*T, createAMDGPUMCInstrInfo); TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo); TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); Index: lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt +++ lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt @@ -8,5 +8,6 @@ AMDGPUMCTargetDesc.cpp AMDGPUTargetStreamer.cpp R600MCCodeEmitter.cpp + R600MCTargetDesc.cpp SIMCCodeEmitter.cpp ) Index: lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/AMDGPUFixupKinds.h" -#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600Defines.h" #include "llvm/MC/MCCodeEmitter.h" @@ -36,30 +35,40 @@ namespace { -class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { +class R600MCCodeEmitter : public MCCodeEmitter { const MCRegisterInfo &MRI; + const MCInstrInfo &MCII; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) - : AMDGPUMCCodeEmitter(mcii), MRI(mri) {} + : MRI(mri), MCII(mcii) {} R600MCCodeEmitter(const R600MCCodeEmitter &) = delete; R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete; /// \brief Encode the instruction and write it to the OS. void encodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; /// \returns the encoding for an MCOperand. 
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; + const MCSubtargetInfo &STI) const; private: + void Emit(uint32_t value, raw_ostream &OS) const; void Emit(uint64_t value, raw_ostream &OS) const; unsigned getHWReg(unsigned regNo) const; + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; + void verifyInstructionPredicates(const MCInst &MI, + uint64_t AvailableFeatures) const; + }; } // end anonymous namespace @@ -94,16 +103,16 @@ computeAvailableFeatures(STI.getFeatureBits())); const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); - if (MI.getOpcode() == AMDGPU::RETURN || - MI.getOpcode() == AMDGPU::FETCH_CLAUSE || - MI.getOpcode() == AMDGPU::ALU_CLAUSE || - MI.getOpcode() == AMDGPU::BUNDLE || - MI.getOpcode() == AMDGPU::KILL) { + if (MI.getOpcode() == R600::RETURN || + MI.getOpcode() == R600::FETCH_CLAUSE || + MI.getOpcode() == R600::ALU_CLAUSE || + MI.getOpcode() == R600::BUNDLE || + MI.getOpcode() == R600::KILL) { return; } else if (IS_VTX(Desc)) { uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI); uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset - if (!(STI.getFeatureBits()[AMDGPU::FeatureCaymanISA])) { + if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) { InstWord2 |= 1 << 19; // Mega-Fetch bit } @@ -136,7 +145,7 @@ Emit((uint32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI); - if ((STI.getFeatureBits()[AMDGPU::FeatureR600ALUInst]) && + if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) && ((Desc.TSFlags & R600_InstFlag::OP1) || Desc.TSFlags & R600_InstFlag::OP2)) { uint64_t ISAOpCode = Inst & (0x3FFULL << 39); @@ -186,4 +195,4 @@ } #define ENABLE_INSTR_PREDICATE_VERIFIER -#include "AMDGPUGenMCCodeEmitter.inc" +#include "R600GenMCCodeEmitter.inc" Index: lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp @@ -0,0 +1,27 @@ +//===-- R600MCTargetDesc.cpp - R600 Target Descriptions -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides R600 specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "R600GenInstrInfo.inc" + +MCInstrInfo *llvm::createR600MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitR600MCInstrInfo(X); + return X; +} Index: lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -438,3 +438,6 @@ llvm_unreachable("Encoding of this operand type is not supported yet."); return 0; } + +#define ENABLE_INSTR_PREDICATE_VERIFIER +#include "AMDGPUGenMCCodeEmitter.inc" Index: lib/Target/AMDGPU/Processors.td =================================================================== --- /dev/null +++ lib/Target/AMDGPU/Processors.td @@ -0,0 +1,154 @@ +//===-- Processors.td - GCN Processor definitions ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// The code produced for "generic" is only useful for tests and cannot +// reasonably be expected to execute on any particular target. +def : ProcessorModel<"generic", NoSchedModel, []>; + +//===----------------------------------------------------------------------===// +// Southern Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx600", SIFullSpeedModel, + [FeatureISAVersion6_0_0]>; + +def : ProcessorModel<"tahiti", SIFullSpeedModel, + [FeatureISAVersion6_0_0] +>; + +def : ProcessorModel<"gfx601", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1] +>; + +def : ProcessorModel<"pitcairn", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; + +def : ProcessorModel<"verde", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; + +def : ProcessorModel<"oland", SIQuarterSpeedModel, + [FeatureISAVersion6_0_1]>; + +def : ProcessorModel<"hainan", SIQuarterSpeedModel, [FeatureISAVersion6_0_1]>; + +//===----------------------------------------------------------------------===// +// Sea Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx700", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"bonaire", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"kaveri", SIQuarterSpeedModel, + [FeatureISAVersion7_0_0] +>; + +def : ProcessorModel<"gfx701", SIFullSpeedModel, + [FeatureISAVersion7_0_1] +>; + +def : ProcessorModel<"hawaii", SIFullSpeedModel, + [FeatureISAVersion7_0_1] +>; + +def : ProcessorModel<"gfx702", SIQuarterSpeedModel, + [FeatureISAVersion7_0_2] +>; + +def : ProcessorModel<"gfx703", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"kabini", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3] +>; + +def : ProcessorModel<"mullins", SIQuarterSpeedModel, + [FeatureISAVersion7_0_3]>; + +//===----------------------------------------------------------------------===// +// Volcanic Islands +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"tonga", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + +def : ProcessorModel<"iceland", SIQuarterSpeedModel, + 
[FeatureISAVersion8_0_0] +>; + +def : ProcessorModel<"carrizo", SIQuarterSpeedModel, + [FeatureISAVersion8_0_1] +>; + +def : ProcessorModel<"fiji", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] +>; + +def : ProcessorModel<"polaris10", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"polaris11", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"gfx800", SIQuarterSpeedModel, + [FeatureISAVersion8_0_0] +>; + +def : ProcessorModel<"gfx801", SIQuarterSpeedModel, + [FeatureISAVersion8_0_1] +>; + +def : ProcessorModel<"gfx802", SIQuarterSpeedModel, + [FeatureISAVersion8_0_2] +>; + +def : ProcessorModel<"gfx803", SIQuarterSpeedModel, + [FeatureISAVersion8_0_3] +>; + +def : ProcessorModel<"gfx804", SIQuarterSpeedModel, + [FeatureISAVersion8_0_4] +>; + +def : ProcessorModel<"gfx810", SIQuarterSpeedModel, + [FeatureISAVersion8_1_0] +>; + +//===----------------------------------------------------------------------===// +// GFX9 +//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx900", SIQuarterSpeedModel, + [FeatureISAVersion9_0_0] +>; + +def : ProcessorModel<"gfx901", SIQuarterSpeedModel, + [FeatureISAVersion9_0_1] +>; + +def : ProcessorModel<"gfx902", SIQuarterSpeedModel, + [FeatureISAVersion9_0_2] +>; + +def : ProcessorModel<"gfx903", SIQuarterSpeedModel, + [FeatureISAVersion9_0_3] +>; + Index: lib/Target/AMDGPU/R600.td =================================================================== --- /dev/null +++ lib/Target/AMDGPU/R600.td @@ -0,0 +1,52 @@ + +include "llvm/Target/Target.td" + +def R600InstrInfo : InstrInfo { + let guessInstructionProperties = 1; + let noNamedPositionallyEncodedOperands = 1; +} + +def R600 : Target { + let InstructionSet = R600InstrInfo; + let AllowRegisterRenaming = 1; +} + +let Namespace = "R600" in { + +foreach Index = 0-15 in { + def sub#Index : SubRegIndex<32, !shl(Index, 5)>; +} + +include "R600RegisterInfo.td" + +} + +def NullALU : InstrItinClass; +def ALU_NULL : FuncUnit; + +include "AMDGPUFeatures.td" +include "R600Schedule.td" +include "R600Processors.td" +include "R600Intrinsics.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUInstructions.td" +include "R600Instructions.td" +include "R700Instructions.td" +include "EvergreenInstructions.td" +include "CaymanInstructions.td" + +// Calling convention for R600 +def CC_R600 : CallingConv<[ + CCIfInReg>> +]>; + +// Calling convention for compute kernels +def CC_R600_Kernel : CallingConv<[ + CCCustom<"allocateKernArg"> +]>; Index: lib/Target/AMDGPU/R600ClauseMergePass.cpp =================================================================== --- lib/Target/AMDGPU/R600ClauseMergePass.cpp +++ lib/Target/AMDGPU/R600ClauseMergePass.cpp @@ -34,8 +34,8 @@ static bool isCFAlu(const MachineInstr &MI) { switch (MI.getOpcode()) { - case AMDGPU::CF_ALU: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU: + case R600::CF_ALU_PUSH_BEFORE: return true; default: return false; @@ -85,20 +85,20 @@ unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::COUNT)) + .getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::COUNT)) .getImm(); } bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr &MI) const { assert(isCFAlu(MI)); return MI - .getOperand(TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::Enabled)) + 
.getOperand(TII->getOperandIdx(MI.getOpcode(), R600::OpName::Enabled)) .getImm(); } void R600ClauseMergePass::cleanPotentialDisabledCFAlu( MachineInstr &CFAlu) const { - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); MachineBasicBlock::iterator I = CFAlu, E = CFAlu.getParent()->end(); I++; do { @@ -117,7 +117,7 @@ bool R600ClauseMergePass::mergeIfPossible(MachineInstr &RootCFAlu, const MachineInstr &LatrCFAlu) const { assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu)); - int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT); + int CntIdx = TII->getOperandIdx(R600::CF_ALU, R600::OpName::COUNT); unsigned RootInstCount = getCFAluSize(RootCFAlu), LaterInstCount = getCFAluSize(LatrCFAlu); unsigned CumuledInsts = RootInstCount + LaterInstCount; @@ -125,15 +125,15 @@ DEBUG(dbgs() << "Excess inst counts\n"); return false; } - if (RootCFAlu.getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (RootCFAlu.getOpcode() == R600::CF_ALU_PUSH_BEFORE) return false; // Is KCache Bank 0 compatible ? int Mode0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE0); int KBank0Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK0); int KBank0LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR0); if (LatrCFAlu.getOperand(Mode0Idx).getImm() && RootCFAlu.getOperand(Mode0Idx).getImm() && (LatrCFAlu.getOperand(KBank0Idx).getImm() != @@ -145,11 +145,11 @@ } // Is KCache Bank 1 compatible ? int Mode1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_MODE1); int KBank1Idx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_BANK1); int KBank1LineIdx = - TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1); + TII->getOperandIdx(R600::CF_ALU, R600::OpName::KCACHE_ADDR1); if (LatrCFAlu.getOperand(Mode1Idx).getImm() && RootCFAlu.getOperand(Mode1Idx).getImm() && (LatrCFAlu.getOperand(KBank1Idx).getImm() != Index: lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp =================================================================== --- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -94,7 +94,7 @@ } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { - if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && + if (Opcode == R600::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; @@ -103,10 +103,10 @@ switch(Opcode) { default: return false; - case AMDGPU::CF_ALU_PUSH_BEFORE: - case AMDGPU::CF_ALU_ELSE_AFTER: - case AMDGPU::CF_ALU_BREAK: - case AMDGPU::CF_ALU_CONTINUE: + case R600::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_ELSE_AFTER: + case R600::CF_ALU_BREAK: + case R600::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; if (ST->getWavefrontSize() == 64) { @@ -168,8 +168,8 @@ void CFStack::pushBranch(unsigned Opcode, bool isWQM) { CFStack::StackItem Item = CFStack::ENTRY; switch(Opcode) { - case AMDGPU::CF_PUSH_EG: - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_PUSH_EG: + case R600::CF_ALU_PUSH_BEFORE: if (!isWQM) { if (!ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) @@ -240,8 +240,8 @@ 
bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: + case R600::KILL: + case R600::RETURN: return true; default: return false; @@ -253,41 +253,41 @@ bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN); switch (CFI) { case CF_TC: - Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; + Opcode = isEg ? R600::CF_TC_EG : R600::CF_TC_R600; break; case CF_VC: - Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; + Opcode = isEg ? R600::CF_VC_EG : R600::CF_VC_R600; break; case CF_CALL_FS: - Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; + Opcode = isEg ? R600::CF_CALL_FS_EG : R600::CF_CALL_FS_R600; break; case CF_WHILE_LOOP: - Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; + Opcode = isEg ? R600::WHILE_LOOP_EG : R600::WHILE_LOOP_R600; break; case CF_END_LOOP: - Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; + Opcode = isEg ? R600::END_LOOP_EG : R600::END_LOOP_R600; break; case CF_LOOP_BREAK: - Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; + Opcode = isEg ? R600::LOOP_BREAK_EG : R600::LOOP_BREAK_R600; break; case CF_LOOP_CONTINUE: - Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; + Opcode = isEg ? R600::CF_CONTINUE_EG : R600::CF_CONTINUE_R600; break; case CF_JUMP: - Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; + Opcode = isEg ? R600::CF_JUMP_EG : R600::CF_JUMP_R600; break; case CF_ELSE: - Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; + Opcode = isEg ? R600::CF_ELSE_EG : R600::CF_ELSE_R600; break; case CF_POP: - Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; + Opcode = isEg ? R600::POP_EG : R600::POP_R600; break; case CF_END: if (ST->hasCaymanISA()) { - Opcode = AMDGPU::CF_END_CM; + Opcode = R600::CF_END_CM; break; } - Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; + Opcode = isEg ? 
R600::CF_END_EG : R600::CF_END_R600; break; } assert (Opcode && "No opcode selected"); @@ -305,21 +305,21 @@ continue; if (MO.isDef()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + &R600::R600_Reg128RegClass); } if (MO.isUse()) { unsigned Reg = MO.getReg(); - if (AMDGPU::R600_Reg128RegClass.contains(Reg)) + if (R600::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)), - &AMDGPU::R600_Reg128RegClass); + &R600::R600_Reg128RegClass); } } if ((DstRegs.find(SrcMI) == DstRegs.end())) { @@ -359,15 +359,15 @@ void getLiteral(MachineInstr &MI, std::vector &Lits) const { static const unsigned LiteralRegs[] = { - AMDGPU::ALU_LITERAL_X, - AMDGPU::ALU_LITERAL_Y, - AMDGPU::ALU_LITERAL_Z, - AMDGPU::ALU_LITERAL_W + R600::ALU_LITERAL_X, + R600::ALU_LITERAL_Y, + R600::ALU_LITERAL_Z, + R600::ALU_LITERAL_W }; const SmallVector, 3> Srcs = TII->getSrcs(MI); for (const auto &Src:Srcs) { - if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() != R600::ALU_LITERAL_X) continue; int64_t Imm = Src.second; std::vector::iterator It = @@ -377,7 +377,7 @@ // Get corresponding Operand MachineOperand &Operand = MI.getOperand( - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (It != Lits.end()) { // Reuse existing literal reg @@ -400,7 +400,7 @@ unsigned LiteralPair0 = Literals[i]; unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), - TII->get(AMDGPU::LITERALS)) + TII->get(R600::LITERALS)) .addImm(LiteralPair0) .addImm(LiteralPair1); } @@ -442,7 +442,7 @@ } for (unsigned i = 0, e = Literals.size(); i < e; i += 2) { MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(), - TII->get(AMDGPU::LITERALS)); + TII->get(R600::LITERALS)); if (Literals[i]->isImm()) { MILit.addImm(Literals[i]->getImm()); } else { @@ -471,7 +471,7 @@ unsigned &CfCount) { CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -483,7 +483,7 @@ Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); - BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount); + BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } @@ -540,33 +540,33 @@ } MachineBasicBlock::iterator MI = I; - if (MI->getOpcode() != AMDGPU::ENDIF) + if (MI->getOpcode() != R600::ENDIF) LastAlu.back() = nullptr; - if (MI->getOpcode() == AMDGPU::CF_ALU) + if (MI->getOpcode() == R600::CF_ALU) LastAlu.back() = &*MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { - case AMDGPU::CF_ALU_PUSH_BEFORE: + case R600::CF_ALU_PUSH_BEFORE: if (RequiresWorkAround) { DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); - BuildMI(MBB, MI, MBB.findDebugLoc(MI), 
TII->get(AMDGPU::CF_PUSH_EG)) + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(R600::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); - MI->setDesc(TII->get(AMDGPU::CF_ALU)); + MI->setDesc(TII->get(R600::CF_ALU)); CfCount++; - CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + CFStack.pushBranch(R600::CF_PUSH_EG); } else - CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + CFStack.pushBranch(R600::CF_ALU_PUSH_BEFORE); LLVM_FALLTHROUGH; - case AMDGPU::CF_ALU: + case R600::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; break; - case AMDGPU::WHILELOOP: { + case R600::WHILELOOP: { CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) @@ -579,7 +579,7 @@ CfCount++; break; } - case AMDGPU::ENDLOOP: { + case R600::ENDLOOP: { CFStack.popLoop(); std::pair> Pair = std::move(LoopStack.back()); @@ -591,7 +591,7 @@ CfCount++; break; } - case AMDGPU::IF_PREDICATE_SET: { + case R600::IF_PREDICATE_SET: { LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) @@ -603,7 +603,7 @@ CfCount++; break; } - case AMDGPU::ELSE: { + case R600::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); CounterPropagateAddr(*JumpInst, CfCount); @@ -617,7 +617,7 @@ CfCount++; break; } - case AMDGPU::ENDIF: { + case R600::ENDIF: { CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); @@ -639,7 +639,7 @@ MI->eraseFromParent(); break; } - case AMDGPU::BREAK: { + case R600::BREAK: { CfCount ++; MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_BREAK)) @@ -648,7 +648,7 @@ MI->eraseFromParent(); break; } - case AMDGPU::CONTINUE: { + case R600::CONTINUE: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_CONTINUE)) .addImm(0); @@ -657,12 +657,12 @@ CfCount++; break; } - case AMDGPU::RETURN: { + case R600::RETURN: { DebugLoc DL = MBB.findDebugLoc(MI); BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END)); CfCount++; if (CfCount % 2) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD)); + BuildMI(MBB, I, DL, TII->get(R600::PAD)); CfCount++; } MI->eraseFromParent(); @@ -683,7 +683,7 @@ for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { MachineInstr *Alu = ToPopAfter[i]; BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), - TII->get(AMDGPU::CF_ALU_POP_AFTER)) + TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) .addImm(Alu->getOperand(1).getImm()) .addImm(Alu->getOperand(2).getImm()) Index: lib/Target/AMDGPU/R600EmitClauseMarkers.cpp =================================================================== --- lib/Target/AMDGPU/R600EmitClauseMarkers.cpp +++ lib/Target/AMDGPU/R600EmitClauseMarkers.cpp @@ -52,12 +52,12 @@ unsigned OccupiedDwords(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return 4; - case AMDGPU::KILL: + case R600::KILL: return 0; default: break; @@ -77,7 +77,7 @@ E = MI.operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++NumLiteral; } return 1 + NumLiteral; @@ -89,12 +89,12 @@ if (TII->isVector(MI) || TII->isCubeOp(MI.getOpcode())) return true; switch 
(MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -103,9 +103,9 @@ bool IsTrivialInst(MachineInstr &MI) const { switch (MI.getOpcode()) { - case AMDGPU::KILL: - case AMDGPU::RETURN: - case AMDGPU::IMPLICIT_DEF: + case R600::KILL: + case R600::RETURN: + case R600::IMPLICIT_DEF: return true; default: return false; @@ -132,16 +132,16 @@ bool UpdateInstr = true) const { std::vector> UsedKCache; - if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != AMDGPU::DOT_4) + if (!TII->isALUInstr(MI.getOpcode()) && MI.getOpcode() != R600::DOT_4) return true; const SmallVectorImpl> &Consts = TII->getSrcs(MI); assert( - (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == AMDGPU::DOT_4) && + (TII->isALUInstr(MI.getOpcode()) || MI.getOpcode() == R600::DOT_4) && "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; unsigned Sel = Consts[i].second; unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31; @@ -172,16 +172,16 @@ return true; for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) { - if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) + if (Consts[i].first->getReg() != R600::ALU_CONST) continue; switch(UsedKCache[j].first) { case 0: Consts[i].first->setReg( - AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC0RegClass.getRegister(UsedKCache[j].second)); break; case 1: Consts[i].first->setReg( - AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second)); + R600::R600_KC1RegClass.getRegister(UsedKCache[j].second)); break; default: llvm_unreachable("Wrong Cache Line"); @@ -253,7 +253,7 @@ break; if (AluInstCount > TII->getMaxAlusPerClause()) break; - if (I->getOpcode() == AMDGPU::PRED_X) { + if (I->getOpcode() == R600::PRED_X) { // We put PRED_X in its own clause to ensure that ifcvt won't create // clauses with more than 128 insts. // IfCvt is indeed checking that "then" and "else" branches of an if @@ -289,7 +289,7 @@ AluInstCount += OccupiedDwords(*I); } unsigned Opcode = PushBeforeModifier ? - AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; + R600::CF_ALU_PUSH_BEFORE : R600::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) // We don't use the ADDR field until R600ControlFlowFinalizer pass, where // it is safe to assume it is 0. 
However if we always put 0 here, the ifcvt @@ -322,7 +322,7 @@ BB != BB_E; ++BB) { MachineBasicBlock &MBB = *BB; MachineBasicBlock::iterator I = MBB.begin(); - if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU) + if (I != MBB.end() && I->getOpcode() == R600::CF_ALU) continue; // BB was already parsed for (MachineBasicBlock::iterator E = MBB.end(); I != E;) { if (isALU(*I)) { Index: lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp =================================================================== --- lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp +++ lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp @@ -96,16 +96,16 @@ // Expand LDS_*_RET instructions if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineOperand &DstOp = MI.getOperand(DstIdx); MachineInstr *Mov = TII->buildMovInstr(&MBB, I, - DstOp.getReg(), AMDGPU::OQAP); - DstOp.setReg(AMDGPU::OQAP); + DstOp.getReg(), R600::OQAP); + DstOp.setReg(R600::OQAP); int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), - AMDGPU::OpName::pred_sel); + R600::OpName::pred_sel); // Copy the pred_sel bit Mov->getOperand(MovPredSelIdx).setReg( MI.getOperand(LDSPredSelIdx).getReg()); @@ -114,7 +114,7 @@ switch (MI.getOpcode()) { default: break; // Expand PRED_X to one of the PRED_SET instructions. - case AMDGPU::PRED_X: { + case R600::PRED_X: { uint64_t Flags = MI.getOperand(3).getImm(); // The native opcode used by PRED_X is stored as an immediate in the // third operand. @@ -122,17 +122,18 @@ MI.getOperand(2).getImm(), // opcode MI.getOperand(0).getReg(), // dst MI.getOperand(1).getReg(), // src0 - AMDGPU::ZERO); // src1 + R600::ZERO); // src1 TII->addFlag(*PredSet, 0, MO_FLAG_MASK); if (Flags & MO_FLAG_PUSH) { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_exec_mask, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_exec_mask, 1); } else { - TII->setImmOperand(*PredSet, AMDGPU::OpName::update_pred, 1); + TII->setImmOperand(*PredSet, R600::OpName::update_pred, 1); } MI.eraseFromParent(); continue; } - case AMDGPU::DOT_4: { + case R600::DOT_4: { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); unsigned DstReg = MI.getOperand(0).getReg(); @@ -141,7 +142,7 @@ for (unsigned Chan = 0; Chan < 4; ++Chan) { bool Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); MachineInstr *BMI = TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); if (Chan > 0) { @@ -156,10 +157,10 @@ // While not strictly necessary from hw point of view, we force // all src operands of a dot4 inst to belong to the same slot. 
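// Illustrative aside -- not part of the diff. A worked example of the kcache
// select decoding in SubstituteKCacheBank() (R600EmitClauseMarkers.cpp above);
// the field layout is inferred only from that code, and the value 2061 is a
// made-up sample:
//   unsigned Sel  = 2061;                    // hypothetical src?_sel immediate
//   unsigned Chan = Sel & 3;                 // == 1 -> channel Y
//   unsigned Line = ((Sel >> 2) - 512) & 31; // == 3 -> constant line 3
// i.e. the low two bits pick the channel of an ALU_CONST source and the
// remaining bits, biased by 512, pick one of the 32 lines a kcache bank can
// map; that (bank, line) pair is what gets rewritten into R600_KC0/R600_KC1.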
unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + TII->getOperandIdx(Opcode, R600::OpName::src0)) .getReg(); unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + TII->getOperandIdx(Opcode, R600::OpName::src1)) .getReg(); (void) Src0; (void) Src1; @@ -206,14 +207,14 @@ // T0_W = CUBE T1_Y, T1_Z for (unsigned Chan = 0; Chan < 4; Chan++) { unsigned DstReg = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); + TII->getOperandIdx(MI, R600::OpName::dst)).getReg(); unsigned Src0 = MI.getOperand( - TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); + TII->getOperandIdx(MI, R600::OpName::src0)).getReg(); unsigned Src1 = 0; // Determine the correct source registers if (!IsCube) { - int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); + int Src1Idx = TII->getOperandIdx(MI, R600::OpName::src1); if (Src1Idx != -1) { Src1 = MI.getOperand(Src1Idx).getReg(); } @@ -241,7 +242,7 @@ // the current Channel. Mask = (Chan != TRI.getHWRegChan(DstReg)); unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + DstReg = R600::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); } // Set the IsLast bit @@ -250,11 +251,11 @@ // Add the new instruction unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::CUBE_r600_pseudo: - Opcode = AMDGPU::CUBE_r600_real; + case R600::CUBE_r600_pseudo: + Opcode = R600::CUBE_r600_real; break; - case AMDGPU::CUBE_eg_pseudo: - Opcode = AMDGPU::CUBE_eg_real; + case R600::CUBE_eg_pseudo: + Opcode = R600::CUBE_eg_real; break; default: break; @@ -271,12 +272,12 @@ if (NotLast) { TII->addFlag(*NewMI, 0, MO_FLAG_NOT_LAST); } - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); - SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::clamp); + SetFlagInNewMI(NewMI, &MI, R600::OpName::literal); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_abs); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src0_neg); + SetFlagInNewMI(NewMI, &MI, R600::OpName::src1_neg); } MI.eraseFromParent(); } Index: lib/Target/AMDGPU/R600ISelLowering.h =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.h +++ lib/Target/AMDGPU/R600ISelLowering.h @@ -23,6 +23,8 @@ class R600Subtarget; class R600TargetLowering final : public AMDGPUTargetLowering { + + const R600Subtarget *Subtarget; public: R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI); @@ -36,6 +38,7 @@ void ReplaceNodeResults(SDNode * N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, Index: lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.cpp +++ lib/Target/AMDGPU/R600ISelLowering.cpp @@ -14,11 +14,11 @@ #include "R600ISelLowering.h" #include "AMDGPUFrameLowering.h" -#include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600FrameLowering.h" #include 
"R600InstrInfo.h" +#include "R600IntrinsicInfo.h" #include "R600MachineFunctionInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" @@ -51,17 +51,31 @@ using namespace llvm; +static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + MachineFunction &MF = State.getMachineFunction(); + AMDGPUMachineFunction *MFI = MF.getInfo(); + + uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(), + ArgFlags.getOrigAlign()); + State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return true; +} + +#include "R600GenCallingConv.inc" + R600TargetLowering::R600TargetLowering(const TargetMachine &TM, const R600Subtarget &STI) - : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) { - addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); - addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass); - addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); - addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + : AMDGPUTargetLowering(TM, STI), Subtarget(&STI), Gen(STI.getGeneration()) { + addRegisterClass(MVT::f32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::i32, &R600::R600_Reg32RegClass); + addRegisterClass(MVT::v2f32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v2i32, &R600::R600_Reg64RegClass); + addRegisterClass(MVT::v4f32, &R600::R600_Reg128RegClass); + addRegisterClass(MVT::v4i32, &R600::R600_Reg128RegClass); - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); @@ -148,6 +162,11 @@ setOperationAction(ISD::FSUB, MVT::f32, Expand); + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); @@ -246,14 +265,10 @@ setTargetDAGCombine(ISD::LOAD); } -const R600Subtarget *R600TargetLowering::getSubtarget() const { - return static_cast(Subtarget); -} - static inline bool isEOP(MachineBasicBlock::iterator I) { if (std::next(I) == I->getParent()->end()) return false; - return std::next(I)->getOpcode() == AMDGPU::RETURN; + return std::next(I)->getOpcode() == R600::RETURN; } MachineBasicBlock * @@ -262,24 +277,24 @@ MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineBasicBlock::iterator I = MI; - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); switch (MI.getOpcode()) { default: // Replace LDS_*_RET instruction that don't have any uses with the // equivalent LDS_*_NORET instruction. if (TII->isLDSRetInstr(MI.getOpcode())) { - int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(MI.getOpcode(), R600::OpName::dst); assert(DstIdx != -1); MachineInstrBuilder NewMI; // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add // LDS_1A2D support and remove this special case. 
if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) || - MI.getOpcode() == AMDGPU::LDS_CMPST_RET) + MI.getOpcode() == R600::LDS_CMPST_RET) return BB; NewMI = BuildMI(*BB, I, BB->findDebugLoc(I), - TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode()))); + TII->get(R600::getLDSNoRetOp(MI.getOpcode()))); for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) { NewMI.add(MI.getOperand(i)); } @@ -287,31 +302,31 @@ return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } break; - case AMDGPU::CLAMP_R600: { + case R600::CLAMP_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP); break; } - case AMDGPU::FABS_R600: { + case R600::FABS_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_ABS); break; } - case AMDGPU::FNEG_R600: { + case R600::FNEG_R600: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(), + *BB, I, R600::MOV, MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); TII->addFlag(*NewMI, 0, MO_FLAG_NEG); break; } - case AMDGPU::MASK_WRITE: { + case R600::MASK_WRITE: { unsigned maskedRegister = MI.getOperand(0).getReg(); assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); @@ -319,7 +334,7 @@ break; } - case AMDGPU::MOV_IMM_F32: + case R600::MOV_IMM_F32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1) .getFPImm() ->getValueAPF() @@ -327,39 +342,39 @@ .getZExtValue()); break; - case AMDGPU::MOV_IMM_I32: + case R600::MOV_IMM_I32: TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); break; - case AMDGPU::MOV_IMM_GLOBAL_ADDR: { + case R600::MOV_IMM_GLOBAL_ADDR: { //TODO: Perhaps combine this instruction with the next if possible auto MIB = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X); - int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal); + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_LITERAL_X); + int Idx = TII->getOperandIdx(*MIB, R600::OpName::literal); //TODO: Ugh this is rather ugly MIB->getOperand(Idx) = MI.getOperand(1); break; } - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { MachineInstr *NewMI = TII->buildDefaultInstruction( - *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST); - TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel, + *BB, MI, R600::MOV, MI.getOperand(0).getReg(), R600::ALU_CONST); + TII->setImmOperand(*NewMI, R600::OpName::src0_sel, MI.getOperand(1).getImm()); break; } - case AMDGPU::RAT_WRITE_CACHELESS_32_eg: - case AMDGPU::RAT_WRITE_CACHELESS_64_eg: - case AMDGPU::RAT_WRITE_CACHELESS_128_eg: + case R600::RAT_WRITE_CACHELESS_32_eg: + case R600::RAT_WRITE_CACHELESS_64_eg: + case R600::RAT_WRITE_CACHELESS_128_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::RAT_STORE_TYPED_eg: + case R600::RAT_STORE_TYPED_eg: BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -367,49 +382,49 @@ .addImm(isEOP(I)); // Set End of program bit break; - case AMDGPU::BRANCH: - BuildMI(*BB, I, 
BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + case R600::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP)) .add(MI.getOperand(0)); break; - case AMDGPU::BRANCH_COND_f32: { + case R600::BRANCH_COND_f32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE) + .addImm(R600::PRED_SETNE) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::BRANCH_COND_i32: { + case R600::BRANCH_COND_i32: { MachineInstr *NewMI = - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), - AMDGPU::PREDICATE_BIT) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::PRED_X), + R600::PREDICATE_BIT) .add(MI.getOperand(1)) - .addImm(AMDGPU::PRED_SETNE_INT) + .addImm(R600::PRED_SETNE_INT) .addImm(0); // Flags TII->addFlag(*NewMI, 0, MO_FLAG_PUSH); - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND)) + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(R600::JUMP_COND)) .add(MI.getOperand(0)) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); break; } - case AMDGPU::EG_ExportSwz: - case AMDGPU::R600_ExportSwz: { + case R600::EG_ExportSwz: + case R600::R600_ExportSwz: { // Instruction is left unmodified if its not the last one of its type bool isLastInstructionOfItsType = true; unsigned InstExportType = MI.getOperand(1).getImm(); for (MachineBasicBlock::iterator NextExportInst = std::next(I), EndBlock = BB->end(); NextExportInst != EndBlock; NextExportInst = std::next(NextExportInst)) { - if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz || - NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) { + if (NextExportInst->getOpcode() == R600::EG_ExportSwz || + NextExportInst->getOpcode() == R600::R600_ExportSwz) { unsigned CurrentInstExportType = NextExportInst->getOperand(1) .getImm(); if (CurrentInstExportType == InstExportType) { @@ -421,7 +436,7 @@ bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; - unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40; + unsigned CfInst = (MI.getOpcode() == R600::EG_ExportSwz) ? 
84 : 40; BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode())) .add(MI.getOperand(0)) .add(MI.getOperand(1)) @@ -434,7 +449,7 @@ .addImm(EOP); break; } - case AMDGPU::RETURN: { + case R600::RETURN: { return BB; } } @@ -479,7 +494,7 @@ unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_store_swizzle: { + case r600Intrinsic::r600_store_swizzle: { SDLoc DL(Op); const SDValue Args[8] = { Chain, @@ -506,14 +521,14 @@ EVT VT = Op.getValueType(); SDLoc DL(Op); switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: - case AMDGPUIntrinsic::r600_texc: { + case r600Intrinsic::r600_tex: + case r600Intrinsic::r600_texc: { unsigned TextureOp; switch (IntrinsicID) { - case AMDGPUIntrinsic::r600_tex: + case r600Intrinsic::r600_tex: TextureOp = 0; break; - case AMDGPUIntrinsic::r600_texc: + case r600Intrinsic::r600_texc: TextureOp = 1; break; default: @@ -543,7 +558,7 @@ }; return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs); } - case AMDGPUIntrinsic::r600_dot4: { + case r600Intrinsic::r600_dot4: { SDValue Args[8] = { DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), DAG.getConstant(0, DL, MVT::i32)), @@ -590,23 +605,23 @@ return LowerImplicitParameter(DAG, VT, DL, 8); case Intrinsic::r600_read_tgid_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_X, VT); case Intrinsic::r600_read_tgid_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Y, VT); case Intrinsic::r600_read_tgid_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T1_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T1_Z, VT); case Intrinsic::r600_read_tidig_x: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_X, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_X, VT); case Intrinsic::r600_read_tidig_y: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Y, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Y, VT); case Intrinsic::r600_read_tidig_z: - return CreateLiveInRegisterRaw(DAG, &AMDGPU::R600_TReg32RegClass, - AMDGPU::T0_Z, VT); + return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass, + R600::T0_Z, VT); case Intrinsic::r600_recipsqrt_ieee: return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); @@ -1528,7 +1543,7 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); - const R600FrameLowering *TFL = getSubtarget()->getFrameLowering(); + const R600FrameLowering *TFL = Subtarget->getFrameLowering(); FrameIndexSDNode *FIN = cast(Op); @@ -1540,6 +1555,28 @@ Op.getValueType()); } +CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + case CallingConv::C: + case CallingConv::Fast: + case CallingConv::Cold: + return CC_R600_Kernel; + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_ES: + case CallingConv::AMDGPU_LS: + return CC_R600; + default: + 
report_fatal_error("Unsupported calling convention."); + } +} + /// XXX Only kernel functions are supported, so we can assume for now that /// every function is a kernel function, but in the future we should use /// separate calling conventions for kernel and non-kernel functions. @@ -1572,7 +1609,7 @@ } if (AMDGPU::isShader(CallConv)) { - unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); + unsigned Reg = MF.addLiveIn(VA.getLocReg(), &R600::R600_Reg128RegClass); SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); continue; @@ -1990,26 +2027,26 @@ SDValue &Src, SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Src.isMachineOpcode()) return false; switch (Src.getMachineOpcode()) { - case AMDGPU::FNEG_R600: + case R600::FNEG_R600: if (!Neg.getNode()) return false; Src = Src.getOperand(0); Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::FABS_R600: + case R600::FABS_R600: if (!Abs.getNode()) return false; Src = Src.getOperand(0); Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32); return true; - case AMDGPU::CONST_COPY: { + case R600::CONST_COPY: { unsigned Opcode = ParentNode->getMachineOpcode(); - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; if (!Sel.getNode()) return false; @@ -2020,17 +2057,17 @@ // Gather constants values int SrcIndices[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, R600::OpName::src2), + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; std::vector Consts; for (int OtherSrcIdx : SrcIndices) { @@ -2043,7 +2080,7 @@ } if (RegisterSDNode *Reg = dyn_cast(ParentNode->getOperand(OtherSrcIdx))) { - if (Reg->getReg() == AMDGPU::ALU_CONST) { + if (Reg->getReg() == R600::ALU_CONST) { ConstantSDNode *Cst = cast(ParentNode->getOperand(OtherSelIdx)); Consts.push_back(Cst->getZExtValue()); @@ -2058,30 +2095,30 @@ } Sel = CstOffset; - Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32); + Src = DAG.getRegister(R600::ALU_CONST, MVT::f32); return true; } - case AMDGPU::MOV_IMM_GLOBAL_ADDR: + case R600::MOV_IMM_GLOBAL_ADDR: // Check if the Imm slot is used. Taken from below. 
if (cast(Imm)->getZExtValue()) return false; Imm = Src.getOperand(0); - Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32); + Src = DAG.getRegister(R600::ALU_LITERAL_X, MVT::i32); return true; - case AMDGPU::MOV_IMM_I32: - case AMDGPU::MOV_IMM_F32: { - unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + case R600::MOV_IMM_I32: + case R600::MOV_IMM_F32: { + unsigned ImmReg = R600::ALU_LITERAL_X; uint64_t ImmValue = 0; - if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) { + if (Src.getMachineOpcode() == R600::MOV_IMM_F32) { ConstantFPSDNode *FPC = dyn_cast(Src.getOperand(0)); float FloatValue = FPC->getValueAPF().convertToFloat(); if (FloatValue == 0.0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (FloatValue == 0.5) { - ImmReg = AMDGPU::HALF; + ImmReg = R600::HALF; } else if (FloatValue == 1.0) { - ImmReg = AMDGPU::ONE; + ImmReg = R600::ONE; } else { ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue(); } @@ -2089,9 +2126,9 @@ ConstantSDNode *C = dyn_cast(Src.getOperand(0)); uint64_t Value = C->getZExtValue(); if (Value == 0) { - ImmReg = AMDGPU::ZERO; + ImmReg = R600::ZERO; } else if (Value == 1) { - ImmReg = AMDGPU::ONE_INT; + ImmReg = R600::ONE_INT; } else { ImmValue = Value; } @@ -2100,7 +2137,7 @@ // Check that we aren't already using an immediate. // XXX: It's possible for an instruction to have more than one // immediate operand, but this is not supported yet. - if (ImmReg == AMDGPU::ALU_LITERAL_X) { + if (ImmReg == R600::ALU_LITERAL_X) { if (!Imm.getNode()) return false; ConstantSDNode *C = dyn_cast(Imm); @@ -2120,7 +2157,7 @@ /// \brief Fold the instructions after selecting them SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, SelectionDAG &DAG) const { - const R600InstrInfo *TII = getSubtarget()->getInstrInfo(); + const R600InstrInfo *TII = Subtarget->getInstrInfo(); if (!Node->isMachineOpcode()) return Node; @@ -2129,36 +2166,36 @@ std::vector Ops(Node->op_begin(), Node->op_end()); - if (Opcode == AMDGPU::DOT_4) { + if (Opcode == R600::DOT_4) { int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_W) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_neg_Z), + 
TII->getOperandIdx(Opcode, R600::OpName::src0_neg_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg_W) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W) + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs_W), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_X), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Y), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_Z), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs_W) }; for (unsigned i = 0; i < 8; i++) { if (OperandIdx[i] < 0) @@ -2166,7 +2203,7 @@ SDValue &Src = Ops[OperandIdx[i] - 1]; SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue &Abs = Ops[AbsIdx[i] - 1]; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); if (HasDst) SelIdx--; @@ -2174,19 +2211,19 @@ if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::REG_SEQUENCE) { + } else if (Opcode == R600::REG_SEQUENCE) { for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) { SDValue &Src = Ops[i]; if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG)) return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - } else if (Opcode == AMDGPU::CLAMP_R600) { + } else if (Opcode == R600::CLAMP_R600) { SDValue Src = Node->getOperand(0); if (!Src.isMachineOpcode() || !TII->hasInstrModifiers(Src.getMachineOpcode())) return Node; int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(), - AMDGPU::OpName::clamp); + R600::OpName::clamp); if (ClampIdx < 0) return Node; SDLoc DL(Node); @@ -2198,18 +2235,18 @@ if (!TII->hasInstrModifiers(Opcode)) return Node; int OperandIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2) + TII->getOperandIdx(Opcode, R600::OpName::src0), + TII->getOperandIdx(Opcode, R600::OpName::src1), + TII->getOperandIdx(Opcode, R600::OpName::src2) }; int NegIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg) + TII->getOperandIdx(Opcode, R600::OpName::src0_neg), + TII->getOperandIdx(Opcode, R600::OpName::src1_neg), + TII->getOperandIdx(Opcode, R600::OpName::src2_neg) }; int AbsIdx[] = { - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs), - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs), + TII->getOperandIdx(Opcode, R600::OpName::src0_abs), + TII->getOperandIdx(Opcode, R600::OpName::src1_abs), -1 }; for (unsigned i = 0; i < 3; i++) { @@ -2219,9 +2256,9 @@ SDValue &Neg = Ops[NegIdx[i] - 1]; SDValue 
FakeAbs; SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs; - bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1; + bool HasDst = TII->getOperandIdx(Opcode, R600::OpName::dst) > -1; int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]); - int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal); + int ImmIdx = TII->getOperandIdx(Opcode, R600::OpName::literal); if (HasDst) { SelIdx--; ImmIdx--; Index: lib/Target/AMDGPU/R600InstrFormats.td =================================================================== --- lib/Target/AMDGPU/R600InstrFormats.td +++ lib/Target/AMDGPU/R600InstrFormats.td @@ -41,7 +41,7 @@ bit LDS_1A2D = 0; let SubtargetPredicate = isR600toCayman; - let Namespace = "AMDGPU"; + let Namespace = "R600"; let OutOperandList = outs; let InOperandList = ins; let AsmString = asm; Index: lib/Target/AMDGPU/R600InstrInfo.h =================================================================== --- lib/Target/AMDGPU/R600InstrInfo.h +++ lib/Target/AMDGPU/R600InstrInfo.h @@ -15,8 +15,11 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600INSTRINFO_H -#include "AMDGPUInstrInfo.h" #include "R600RegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "R600GenInstrInfo.inc" namespace llvm { @@ -34,7 +37,7 @@ class MachineInstrBuilder; class R600Subtarget; -class R600InstrInfo final : public AMDGPUInstrInfo { +class R600InstrInfo final : public R600GenInstrInfo { private: const R600RegisterInfo RI; const R600Subtarget &ST; @@ -324,7 +327,7 @@ PseudoSourceValue::PSVKind Kind) const override; }; -namespace AMDGPU { +namespace R600 { int getLDSNoRetOp(uint16_t Opcode); Index: lib/Target/AMDGPU/R600InstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600InstrInfo.cpp +++ lib/Target/AMDGPU/R600InstrInfo.cpp @@ -45,10 +45,15 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR -#include "AMDGPUGenDFAPacketizer.inc" +#include "R600GenDFAPacketizer.inc" + +#define GET_INSTRINFO_CTOR_DTOR +#define GET_INSTRMAP_INFO +#define GET_INSTRINFO_NAMED_OPS +#include "R600GenInstrInfo.inc" R600InstrInfo::R600InstrInfo(const R600Subtarget &ST) - : AMDGPUInstrInfo(ST), RI(), ST(ST) {} + : R600GenInstrInfo(-1, -1), RI(), ST(ST) {} bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; @@ -59,31 +64,31 @@ const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || - AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { + if ((R600::R600_Reg128RegClass.contains(DestReg) || + R600::R600_Reg128VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg128RegClass.contains(SrcReg) || + R600::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && - (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || - AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { + } else if((R600::R600_Reg64RegClass.contains(DestReg) || + R600::R600_Reg64VerticalRegClass.contains(DestReg)) && + (R600::R600_Reg64RegClass.contains(SrcReg) || + R600::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } if (VectorComponents > 0) { for (unsigned I = 0; I < 
VectorComponents; I++) { unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I); - buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + buildDefaultInstruction(MBB, MI, R600::MOV, RI.getSubReg(DestReg, SubRegIndex), RI.getSubReg(SrcReg, SubRegIndex)) .addReg(DestReg, RegState::Define | RegState::Implicit); } } else { - MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, R600::MOV, DestReg, SrcReg); - NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0)) + NewMI->getOperand(getOperandIdx(*NewMI, R600::OpName::src0)) .setIsKill(KillSrc); } } @@ -104,9 +109,9 @@ switch(Opcode) { default: return false; - case AMDGPU::MOV: - case AMDGPU::MOV_IMM_F32: - case AMDGPU::MOV_IMM_I32: + case R600::MOV: + case R600::MOV_IMM_F32: + case R600::MOV_IMM_I32: return true; } } @@ -118,10 +123,10 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: + case R600::CUBE_r600_pseudo: + case R600::CUBE_r600_real: + case R600::CUBE_eg_pseudo: + case R600::CUBE_eg_real: return true; } } @@ -149,7 +154,7 @@ } bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const { - return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1; + return isLDSInstr(Opcode) && getOperandIdx(Opcode, R600::OpName::dst) != -1; } bool R600InstrInfo::canBeConsideredALU(const MachineInstr &MI) const { @@ -158,12 +163,12 @@ if (isVector(MI) || isCubeOp(MI.getOpcode())) return true; switch (MI.getOpcode()) { - case AMDGPU::PRED_X: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::COPY: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::COPY: + case R600::DOT_4: return true; default: return false; @@ -173,7 +178,7 @@ bool R600InstrInfo::isTransOnly(unsigned Opcode) const { if (ST.hasCaymanISA()) return false; - return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU); + return (get(Opcode).getSchedClass() == R600::Sched::TransALU); } bool R600InstrInfo::isTransOnly(const MachineInstr &MI) const { @@ -181,7 +186,7 @@ } bool R600InstrInfo::isVectorOnly(unsigned Opcode) const { - return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU); + return (get(Opcode).getSchedClass() == R600::Sched::VecALU); } bool R600InstrInfo::isVectorOnly(const MachineInstr &MI) const { @@ -215,8 +220,8 @@ bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const { switch (Opcode) { - case AMDGPU::KILLGT: - case AMDGPU::GROUP_BARRIER: + case R600::KILLGT: + case R600::GROUP_BARRIER: return true; default: return false; @@ -224,11 +229,11 @@ } bool R600InstrInfo::usesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterUseOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterUseOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::definesAddressRegister(MachineInstr &MI) const { - return MI.findRegisterDefOperandIdx(AMDGPU::AR_X) != -1; + return MI.findRegisterDefOperandIdx(R600::AR_X) != -1; } bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const { @@ -242,7 +247,7 @@ TargetRegisterInfo::isVirtualRegister(I->getReg())) continue; - if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg())) + if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg())) return true; } return false; @@ -250,17 +255,17 @@ int 
R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const { static const unsigned SrcSelTable[][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W} + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W} }; for (const auto &Row : SrcSelTable) { @@ -275,23 +280,23 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { SmallVector, 3> Result; - if (MI.getOpcode() == AMDGPU::DOT_4) { + if (MI.getOpcode() == R600::DOT_4) { static const unsigned OpTable[8][2] = { - {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, - {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, - {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z}, - {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W}, - {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X}, - {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y}, - {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z}, - {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}, + {R600::OpName::src0_X, R600::OpName::src0_sel_X}, + {R600::OpName::src0_Y, R600::OpName::src0_sel_Y}, + {R600::OpName::src0_Z, R600::OpName::src0_sel_Z}, + {R600::OpName::src0_W, R600::OpName::src0_sel_W}, + {R600::OpName::src1_X, R600::OpName::src1_sel_X}, + {R600::OpName::src1_Y, R600::OpName::src1_sel_Y}, + {R600::OpName::src1_Z, R600::OpName::src1_sel_Z}, + {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; for (unsigned j = 0; j < 8; j++) { MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); @@ -303,9 +308,9 @@ } static const unsigned OpTable[3][2] = { - {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel}, - {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel}, - {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel}, + {R600::OpName::src0, R600::OpName::src0_sel}, + {R600::OpName::src1, R600::OpName::src1_sel}, + {R600::OpName::src2, R600::OpName::src2_sel}, }; for (unsigned j = 0; j < 3; j++) { @@ -314,15 +319,15 @@ break; MachineOperand &MO = MI.getOperand(SrcIdx); unsigned Reg = MO.getReg(); - if (Reg == AMDGPU::ALU_CONST) { + if (Reg == R600::ALU_CONST) { MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - if (Reg == AMDGPU::ALU_LITERAL_X) { + if (Reg == R600::ALU_LITERAL_X) { MachineOperand &Operand 
= - MI.getOperand(getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal)); + MI.getOperand(getOperandIdx(MI.getOpcode(), R600::OpName::literal)); if (Operand.isImm()) { Result.push_back(std::make_pair(&MO, Operand.getImm())); continue; @@ -346,7 +351,7 @@ ++i; unsigned Reg = Src.first->getReg(); int Index = RI.getEncodingValue(Reg) & 0xff; - if (Reg == AMDGPU::OQAP) { + if (Reg == R600::OQAP) { Result.push_back(std::make_pair(Index, 0U)); } if (PV.find(Reg) != PV.end()) { @@ -436,7 +441,7 @@ const std::pair &Src = Srcs[j]; if (Src.first < 0 || Src.first == 255) continue; - if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) { + if (Src.first == GET_REG_INDEX(RI.getEncodingValue(R600::OQAP))) { if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 && Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) { // The value from output queue A (denoted by register OQAP) can @@ -542,7 +547,7 @@ for (unsigned i = 0, e = IG.size(); i < e; ++i) { IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); unsigned Op = getOperandIdx(IG[i]->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) IG[i]->getOperand(Op).getImm()); } @@ -611,14 +616,14 @@ continue; for (const auto &Src : getSrcs(MI)) { - if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X) + if (Src.first->getReg() == R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) return false; - if (Src.first->getReg() == AMDGPU::ALU_CONST) + if (Src.first->getReg() == R600::ALU_CONST) Consts.push_back(Src.second); - if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) || - AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) { + if (R600::R600_KC0RegClass.contains(Src.first->getReg()) || + R600::R600_KC1RegClass.contains(Src.first->getReg())) { unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff; unsigned Chan = RI.getHWRegChan(Src.first->getReg()); Consts.push_back((Index << 2) | Chan); @@ -637,7 +642,7 @@ static bool isPredicateSetter(unsigned Opcode) { switch (Opcode) { - case AMDGPU::PRED_X: + case R600::PRED_X: return true; default: return false; @@ -659,12 +664,12 @@ static bool isJump(unsigned Opcode) { - return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND; + return Opcode == R600::JUMP || Opcode == R600::JUMP_COND; } static bool isBranch(unsigned Opcode) { - return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 || - Opcode == AMDGPU::BRANCH_COND_f32; + return Opcode == R600::BRANCH || Opcode == R600::BRANCH_COND_i32 || + Opcode == R600::BRANCH_COND_f32; } bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB, @@ -679,7 +684,7 @@ if (I == MBB.end()) return false; - // AMDGPU::BRANCH* instructions are only available after isel and are not + // R600::BRANCH* instructions are only available after isel and are not // handled if (isBranch(I->getOpcode())) return true; @@ -688,7 +693,7 @@ } // Remove successive JUMP - while (I != MBB.begin() && std::prev(I)->getOpcode() == AMDGPU::JUMP) { + while (I != MBB.begin() && std::prev(I)->getOpcode() == R600::JUMP) { MachineBasicBlock::iterator PriorI = std::prev(I); if (AllowModify) I->removeFromParent(); @@ -699,10 +704,10 @@ // If there is only one terminator instruction, process it. 
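// Illustrative aside -- not part of the diff. The two terminator shapes that
// analyzeBranch() recognizes here, written as rough pseudo-MIR (abbreviated,
// not actual printed MIR):
//
//   ; unconditional:
//   JUMP %bb.target
//
//   ; conditional, optionally followed by a fallthrough jump:
//   PRED_X %PREDICATE_BIT, ...        ; the "predicate setter"
//   JUMP_COND %bb.true, %PREDICATE_BIT
//   JUMP %bb.false                    ; only in the two-terminator form
//
// Cond is returned as three operands -- operands 1 and 2 of the PRED_X plus a
// PRED_SEL_ONE register operand -- and insertBranch() below writes Cond[1]
// back into operand 2 of the predicate setter when it re-emits the branch.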
unsigned LastOpc = LastInst.getOpcode(); if (I == MBB.begin() || !isJump((--I)->getOpcode())) { - if (LastOpc == AMDGPU::JUMP) { + if (LastOpc == R600::JUMP) { TBB = LastInst.getOperand(0).getMBB(); return false; - } else if (LastOpc == AMDGPU::JUMP_COND) { + } else if (LastOpc == R600::JUMP_COND) { auto predSet = I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -710,7 +715,7 @@ TBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } return true; // Can't handle indirect branch. @@ -721,7 +726,7 @@ unsigned SecondLastOpc = SecondLastInst.getOpcode(); // If the block ends with a B and a Bcc, handle it. - if (SecondLastOpc == AMDGPU::JUMP_COND && LastOpc == AMDGPU::JUMP) { + if (SecondLastOpc == R600::JUMP_COND && LastOpc == R600::JUMP) { auto predSet = --I; while (!isPredicateSetter(predSet->getOpcode())) { predSet = --I; @@ -730,7 +735,7 @@ FBB = LastInst.getOperand(0).getMBB(); Cond.push_back(predSet->getOperand(1)); Cond.push_back(predSet->getOperand(2)); - Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + Cond.push_back(MachineOperand::CreateReg(R600::PRED_SEL_ONE, false)); return false; } @@ -742,8 +747,8 @@ MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); It != E; ++It) { - if (It->getOpcode() == AMDGPU::CF_ALU || - It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + if (It->getOpcode() == R600::CF_ALU || + It->getOpcode() == R600::CF_ALU_PUSH_BEFORE) return It.getReverse(); } return MBB.end(); @@ -760,7 +765,7 @@ if (!FBB) { if (Cond.empty()) { - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(TBB); return 1; } else { MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end()); @@ -768,14 +773,14 @@ addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + .addReg(R600::PREDICATE_BIT, RegState::Kill); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) return 1; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 1; } } else { @@ -783,15 +788,15 @@ assert(PredSet && "No previous predicate !"); addFlag(*PredSet, 0, MO_FLAG_PUSH); PredSet->getOperand(2).setImm(Cond[1].getImm()); - BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) + BuildMI(&MBB, DL, get(R600::JUMP_COND)) .addMBB(TBB) - .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); - BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + .addReg(R600::PREDICATE_BIT, RegState::Kill); + BuildMI(&MBB, DL, get(R600::JUMP)).addMBB(FBB); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) return 2; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); - CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); + assert (CfAlu->getOpcode() == R600::CF_ALU); + CfAlu->setDesc(get(R600::CF_ALU_PUSH_BEFORE)); return 2; } } @@ -812,18 +817,18 @@ switch (I->getOpcode()) { default: return 0; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = 
findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -837,18 +842,18 @@ // FIXME: only one case?? default: return 1; - case AMDGPU::JUMP_COND: { + case R600::JUMP_COND: { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(*predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); if (CfAlu == MBB.end()) break; - assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); - CfAlu->setDesc(get(AMDGPU::CF_ALU)); + assert (CfAlu->getOpcode() == R600::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(R600::CF_ALU)); break; } - case AMDGPU::JUMP: + case R600::JUMP: I->eraseFromParent(); break; } @@ -863,9 +868,9 @@ unsigned Reg = MI.getOperand(idx).getReg(); switch (Reg) { default: return false; - case AMDGPU::PRED_SEL_ONE: - case AMDGPU::PRED_SEL_ZERO: - case AMDGPU::PREDICATE_BIT: + case R600::PRED_SEL_ONE: + case R600::PRED_SEL_ZERO: + case R600::PREDICATE_BIT: return true; } } @@ -876,9 +881,9 @@ // be predicated. Until we have proper support for instruction clauses in the // backend, we will mark KILL* instructions as unpredicable. - if (MI.getOpcode() == AMDGPU::KILLGT) { + if (MI.getOpcode() == R600::KILLGT) { return false; - } else if (MI.getOpcode() == AMDGPU::CF_ALU) { + } else if (MI.getOpcode() == R600::CF_ALU) { // If the clause start in the middle of MBB then the MBB has more // than a single clause, unable to predicate several clauses. 
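// ----[ Editor's illustration - not part of the patch ]---------------------
// analyzeBranch() above records a condition as {compare kind, compare imm,
// PRED_SEL register}, and reverseBranchCondition() (shortly below) inverts it
// by swapping the compare and flipping the PRED_SEL register. A standalone
// model with illustrative enum values:
enum PredCmp { PRED_SETE, PRED_SETNE, PRED_SETE_INT, PRED_SETNE_INT };
enum PredSel { PRED_SEL_ZERO, PRED_SEL_ONE };

struct BranchCond { PredCmp Cmp; PredSel Sel; };

// Returns false on success, the usual LLVM convention for this hook.
static bool reverseCond(BranchCond &C) {
  switch (C.Cmp) {
  case PRED_SETE:      C.Cmp = PRED_SETNE;     break;
  case PRED_SETNE:     C.Cmp = PRED_SETE;      break;
  case PRED_SETE_INT:  C.Cmp = PRED_SETNE_INT; break;
  case PRED_SETNE_INT: C.Cmp = PRED_SETE_INT;  break;
  }
  C.Sel = (C.Sel == PRED_SEL_ZERO) ? PRED_SEL_ONE : PRED_SEL_ZERO;
  return false;
}
// ----[ end illustration ]---------------------------------------------------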
if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI)) @@ -888,7 +893,7 @@ } else if (isVector(MI)) { return false; } else { - return AMDGPUInstrInfo::isPredicable(MI); + return TargetInstrInfo::isPredicable(MI); } } @@ -929,17 +934,17 @@ R600InstrInfo::reverseBranchCondition(SmallVectorImpl &Cond) const { MachineOperand &MO = Cond[1]; switch (MO.getImm()) { - case AMDGPU::PRED_SETE_INT: - MO.setImm(AMDGPU::PRED_SETNE_INT); + case R600::PRED_SETE_INT: + MO.setImm(R600::PRED_SETNE_INT); break; - case AMDGPU::PRED_SETNE_INT: - MO.setImm(AMDGPU::PRED_SETE_INT); + case R600::PRED_SETNE_INT: + MO.setImm(R600::PRED_SETE_INT); break; - case AMDGPU::PRED_SETE: - MO.setImm(AMDGPU::PRED_SETNE); + case R600::PRED_SETE: + MO.setImm(R600::PRED_SETNE); break; - case AMDGPU::PRED_SETNE: - MO.setImm(AMDGPU::PRED_SETE); + case R600::PRED_SETNE: + MO.setImm(R600::PRED_SETE); break; default: return true; @@ -947,11 +952,11 @@ MachineOperand &MO2 = Cond[2]; switch (MO2.getReg()) { - case AMDGPU::PRED_SEL_ZERO: - MO2.setReg(AMDGPU::PRED_SEL_ONE); + case R600::PRED_SEL_ZERO: + MO2.setReg(R600::PRED_SEL_ONE); break; - case AMDGPU::PRED_SEL_ONE: - MO2.setReg(AMDGPU::PRED_SEL_ZERO); + case R600::PRED_SEL_ONE: + MO2.setReg(R600::PRED_SEL_ZERO); break; default: return true; @@ -968,22 +973,22 @@ ArrayRef Pred) const { int PIdx = MI.findFirstPredOperandIdx(); - if (MI.getOpcode() == AMDGPU::CF_ALU) { + if (MI.getOpcode() == R600::CF_ALU) { MI.getOperand(8).setImm(0); return true; } - if (MI.getOpcode() == AMDGPU::DOT_4) { - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_X)) + if (MI.getOpcode() == R600::DOT_4) { + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Y)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Y)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_Z)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_Z)) .setReg(Pred[2].getReg()); - MI.getOperand(getOperandIdx(MI, AMDGPU::OpName::pred_sel_W)) + MI.getOperand(getOperandIdx(MI, R600::OpName::pred_sel_W)) .setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -991,7 +996,7 @@ MachineOperand &PMO = MI.getOperand(PIdx); PMO.setReg(Pred[2].getReg()); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); - MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + MIB.addReg(R600::PREDICATE_BIT, RegState::Implicit); return true; } @@ -1021,20 +1026,20 @@ default: { MachineBasicBlock *MBB = MI.getParent(); int OffsetOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::addr); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::addr); // addr is a custom operand with multiple MI operands, and only the // first MI operand is given a name. 
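// ----[ Editor's illustration - not part of the patch ]---------------------
// The register-load/store pseudos expanded below keep the register index and
// channel as separate immediates next to the unnamed 'addr' operands (register
// index at OffsetOpIdx + 1, channel looked up by name). The reservation loop
// later in this file numbers the backing T-registers as (4 * Index) + Chan, so
// a simplified model of turning (index, channel) into a flat register slot
// looks like this; the exact calculateIndirectAddress() formula is not shown
// in this hunk, so treat the helper as illustrative:
#include <cassert>

static unsigned indirectSlot(unsigned RegIndex, unsigned Channel) {
  assert(Channel < 4 && "R600 vector registers have four channels (X,Y,Z,W)");
  return 4 * RegIndex + Channel;  // matches the R600_TReg32 numbering below
}
// e.g. indirectSlot(2, 1) == 9  -> channel Y of the third T-register group.
// ----[ end illustration ]---------------------------------------------------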
int RegOpIdx = OffsetOpIdx + 1; int ChanOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::chan); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::chan); if (isRegisterLoad(MI)) { int DstOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::dst); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(), getIndirectAddrRegClass()->getRegister(Address)); } else { @@ -1043,12 +1048,12 @@ } } else if (isRegisterStore(MI)) { int ValOpIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::val); + R600::getNamedOperandIdx(MI.getOpcode(), R600::OpName::val); unsigned RegIndex = MI.getOperand(RegOpIdx).getImm(); unsigned Channel = MI.getOperand(ChanOpIdx).getImm(); unsigned Address = calculateIndirectAddress(RegIndex, Channel); unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg(); - if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) { + if (OffsetReg == R600::INDIRECT_BASE_ADDR) { buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address), MI.getOperand(ValOpIdx).getReg()); } else { @@ -1063,15 +1068,15 @@ MBB->erase(MI); return true; } - case AMDGPU::R600_EXTRACT_ELT_V2: - case AMDGPU::R600_EXTRACT_ELT_V4: + case R600::R600_EXTRACT_ELT_V2: + case R600::R600_EXTRACT_ELT_V4: buildIndirectRead(MI.getParent(), MI, MI.getOperand(0).getReg(), RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(2).getReg(), RI.getHWRegChan(MI.getOperand(1).getReg())); break; - case AMDGPU::R600_INSERT_ELT_V2: - case AMDGPU::R600_INSERT_ELT_V4: + case R600::R600_INSERT_ELT_V2: + case R600::R600_INSERT_ELT_V4: buildIndirectWrite(MI.getParent(), MI, MI.getOperand(2).getReg(), // Value RI.getHWRegIndex(MI.getOperand(1).getReg()), // Address MI.getOperand(3).getReg(), // Offset @@ -1096,14 +1101,14 @@ for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) { for (unsigned Chan = 0; Chan < StackWidth; ++Chan) { - unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan); + unsigned Reg = R600::R600_TReg32RegClass.getRegister((4 * Index) + Chan); TRI.reserveRegisterTuples(Reserved, Reg); } } } const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::R600_TReg32_XRegClass; + return &R600::R600_TReg32_XRegClass; } MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, @@ -1121,20 +1126,20 @@ unsigned AddrReg; switch (AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, OffsetReg); - 
setImmOperand(*MOVA, AMDGPU::OpName::write, 0); + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); + setImmOperand(*MOVA, R600::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, AddrReg, ValueReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::dst_rel, 1); + setImmOperand(*Mov, R600::OpName::dst_rel, 1); return Mov; } @@ -1153,21 +1158,21 @@ unsigned AddrReg; switch (AddrChan) { default: llvm_unreachable("Invalid Channel"); - case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; - case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; - case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; - case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + case 0: AddrReg = R600::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = R600::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = R600::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = R600::R600_Addr_WRegClass.getRegister(Address); break; } - MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, - AMDGPU::AR_X, + MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, R600::MOVA_INT_eg, + R600::AR_X, OffsetReg); - setImmOperand(*MOVA, AMDGPU::OpName::write, 0); - MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV, + setImmOperand(*MOVA, R600::OpName::write, 0); + MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, R600::MOV, ValueReg, AddrReg) - .addReg(AMDGPU::AR_X, + .addReg(R600::AR_X, RegState::Implicit | RegState::Kill); - setImmOperand(*Mov, AMDGPU::OpName::src0_rel, 1); + setImmOperand(*Mov, R600::OpName::src0_rel, 1); return Mov; } @@ -1265,7 +1270,7 @@ //XXX: The r600g finalizer expects this to be 1, once we've moved the //scheduling to the backend, we can change the default to 0. 
MIB.addImm(1) // $last - .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel + .addReg(R600::PRED_SEL_OFF) // $pred_sel .addImm(0) // $literal .addImm(0); // $bank_swizzle @@ -1286,23 +1291,23 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) { switch (Op) { - OPERAND_CASE(AMDGPU::OpName::update_exec_mask) - OPERAND_CASE(AMDGPU::OpName::update_pred) - OPERAND_CASE(AMDGPU::OpName::write) - OPERAND_CASE(AMDGPU::OpName::omod) - OPERAND_CASE(AMDGPU::OpName::dst_rel) - OPERAND_CASE(AMDGPU::OpName::clamp) - OPERAND_CASE(AMDGPU::OpName::src0) - OPERAND_CASE(AMDGPU::OpName::src0_neg) - OPERAND_CASE(AMDGPU::OpName::src0_rel) - OPERAND_CASE(AMDGPU::OpName::src0_abs) - OPERAND_CASE(AMDGPU::OpName::src0_sel) - OPERAND_CASE(AMDGPU::OpName::src1) - OPERAND_CASE(AMDGPU::OpName::src1_neg) - OPERAND_CASE(AMDGPU::OpName::src1_rel) - OPERAND_CASE(AMDGPU::OpName::src1_abs) - OPERAND_CASE(AMDGPU::OpName::src1_sel) - OPERAND_CASE(AMDGPU::OpName::pred_sel) + OPERAND_CASE(R600::OpName::update_exec_mask) + OPERAND_CASE(R600::OpName::update_pred) + OPERAND_CASE(R600::OpName::write) + OPERAND_CASE(R600::OpName::omod) + OPERAND_CASE(R600::OpName::dst_rel) + OPERAND_CASE(R600::OpName::clamp) + OPERAND_CASE(R600::OpName::src0) + OPERAND_CASE(R600::OpName::src0_neg) + OPERAND_CASE(R600::OpName::src0_rel) + OPERAND_CASE(R600::OpName::src0_abs) + OPERAND_CASE(R600::OpName::src0_sel) + OPERAND_CASE(R600::OpName::src1) + OPERAND_CASE(R600::OpName::src1_neg) + OPERAND_CASE(R600::OpName::src1_rel) + OPERAND_CASE(R600::OpName::src1_abs) + OPERAND_CASE(R600::OpName::src1_sel) + OPERAND_CASE(R600::OpName::pred_sel) default: llvm_unreachable("Wrong Operand"); } @@ -1313,39 +1318,39 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + assert (MI->getOpcode() == R600::DOT_4 && "Not Implemented"); unsigned Opcode; if (ST.getGeneration() <= R600Subtarget::R700) - Opcode = AMDGPU::DOT4_r600; + Opcode = R600::DOT4_r600; else - Opcode = AMDGPU::DOT4_eg; + Opcode = R600::DOT4_eg; MachineBasicBlock::iterator I = MI; MachineOperand &Src0 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src0, Slot))); MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(R600::OpName::src1, Slot))); MachineInstr *MIB = buildDefaultInstruction( MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); static const unsigned Operands[14] = { - AMDGPU::OpName::update_exec_mask, - AMDGPU::OpName::update_pred, - AMDGPU::OpName::write, - AMDGPU::OpName::omod, - AMDGPU::OpName::dst_rel, - AMDGPU::OpName::clamp, - AMDGPU::OpName::src0_neg, - AMDGPU::OpName::src0_rel, - AMDGPU::OpName::src0_abs, - AMDGPU::OpName::src0_sel, - AMDGPU::OpName::src1_neg, - AMDGPU::OpName::src1_rel, - AMDGPU::OpName::src1_abs, - AMDGPU::OpName::src1_sel, + R600::OpName::update_exec_mask, + R600::OpName::update_pred, + R600::OpName::write, + R600::OpName::omod, + R600::OpName::dst_rel, + R600::OpName::clamp, + R600::OpName::src0_neg, + R600::OpName::src0_rel, + R600::OpName::src0_abs, + R600::OpName::src0_sel, + R600::OpName::src1_neg, + R600::OpName::src1_rel, + R600::OpName::src1_abs, + R600::OpName::src1_sel, }; MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - getSlotedOps(AMDGPU::OpName::pred_sel, 
Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + getSlotedOps(R600::OpName::pred_sel, Slot))); + MIB->getOperand(getOperandIdx(Opcode, R600::OpName::pred_sel)) .setReg(MO.getReg()); for (unsigned i = 0; i < 14; i++) { @@ -1362,16 +1367,16 @@ MachineBasicBlock::iterator I, unsigned DstReg, uint64_t Imm) const { - MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg, - AMDGPU::ALU_LITERAL_X); - setImmOperand(*MovImm, AMDGPU::OpName::literal, Imm); + MachineInstr *MovImm = buildDefaultInstruction(BB, I, R600::MOV, DstReg, + R600::ALU_LITERAL_X); + setImmOperand(*MovImm, R600::OpName::literal, Imm); return MovImm; } MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned DstReg, unsigned SrcReg) const { - return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg); + return buildDefaultInstruction(*MBB, I, R600::MOV, DstReg, SrcReg); } int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const { @@ -1379,7 +1384,7 @@ } int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const { - return AMDGPU::getNamedOperandIdx(Opcode, Op); + return R600::getNamedOperandIdx(Opcode, Op); } void R600InstrInfo::setImmOperand(MachineInstr &MI, unsigned Op, @@ -1406,25 +1411,25 @@ bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; switch (Flag) { case MO_FLAG_CLAMP: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::clamp); + FlagIndex = getOperandIdx(MI, R600::OpName::clamp); break; case MO_FLAG_MASK: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::write); + FlagIndex = getOperandIdx(MI, R600::OpName::write); break; case MO_FLAG_NOT_LAST: case MO_FLAG_LAST: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::last); + FlagIndex = getOperandIdx(MI, R600::OpName::last); break; case MO_FLAG_NEG: switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_neg); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_neg); break; case 2: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src2_neg); + FlagIndex = getOperandIdx(MI, R600::OpName::src2_neg); break; } break; @@ -1435,10 +1440,10 @@ (void)IsOP3; switch (SrcIdx) { case 0: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src0_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src0_abs); break; case 1: - FlagIndex = getOperandIdx(MI, AMDGPU::OpName::src1_abs); + FlagIndex = getOperandIdx(MI, R600::OpName::src1_abs); break; } break; @@ -1499,15 +1504,15 @@ switch (Kind) { case PseudoSourceValue::Stack: case PseudoSourceValue::FixedStack: - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; case PseudoSourceValue::ConstantPool: case PseudoSourceValue::GOT: case PseudoSourceValue::JumpTable: case PseudoSourceValue::GlobalValueCallEntry: case PseudoSourceValue::ExternalSymbolCallEntry: case PseudoSourceValue::TargetCustom: - return AMDGPUASI.CONSTANT_ADDRESS; + return ST.getAMDGPUAS().CONSTANT_ADDRESS; } llvm_unreachable("Invalid pseudo source kind"); - return AMDGPUASI.PRIVATE_ADDRESS; + return ST.getAMDGPUAS().PRIVATE_ADDRESS; } Index: lib/Target/AMDGPU/R600Instructions.td =================================================================== --- lib/Target/AMDGPU/R600Instructions.td +++ lib/Target/AMDGPU/R600Instructions.td @@ -12,20 +12,19 @@ // //===----------------------------------------------------------------------===// -include 
"R600Intrinsics.td" include "R600InstrFormats.td" // FIXME: Should not be arbitrarily split from other R600 inst classes. class R600WrapperInst pattern = []> : AMDGPUInst, PredicateControl { let SubtargetPredicate = isR600toCayman; + let Namespace = "R600"; } class InstR600ISA pattern = []> : InstR600 { - let Namespace = "AMDGPU"; } def MEMxi : Operand { @@ -87,6 +86,12 @@ def R600_Pred : PredicateOperand; +let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, + usesCustomInserter = 1, Namespace = "R600" in { + def RETURN : ILFormat<(outs), (ins variable_ops), + "RETURN", [(AMDGPUendpgm)] + >; +} let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { @@ -220,34 +225,6 @@ } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0 -def TEX_SHADOW : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return (TType >= 6 && TType <= 8) || TType == 13; - }] ->; - -def TEX_RECT : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 5; - }] ->; - -def TEX_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 9 || TType == 10 || TType == 16; - }] ->; - -def TEX_SHADOW_ARRAY : PatLeaf< - (imm), - [{uint32_t TType = (uint32_t)N->getZExtValue(); - return TType == 11 || TType == 12 || TType == 17; - }] ->; - class EG_CF_RAT cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask, dag outs, dag ins, string asm, list pattern> : InstR600ISA , @@ -358,6 +335,8 @@ // R600 SDNodes //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_PAIR_XY : AMDGPUShaderInst < (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1), (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2), @@ -370,6 +349,8 @@ "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1", []>; +} + def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", SDTypeProfile<1, -1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPVariadic] @@ -417,11 +398,15 @@ // Interpolation Instructions //===----------------------------------------------------------------------===// +let Namespace = "R600" in { + def INTERP_VEC_LOAD : AMDGPUShaderInst < (outs R600_Reg128:$dst), (ins i32imm:$src0), "INTERP_LOAD $src0 : $dst">; +} + def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> { let bank_swizzle = 5; } @@ -661,7 +646,7 @@ let isCodeGenOnly = 1, isPseudo = 1 in { -let usesCustomInserter = 1 in { +let Namespace = "R600", usesCustomInserter = 1 in { class CLAMP : AMDGPUShaderInst < (outs rc:$dst), @@ -800,7 +785,9 @@ (ins immType:$imm), "", [] ->; +> { + let Namespace = "R600"; +} } // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 @@ -1015,7 +1002,7 @@ } -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { class R600_VEC2OP pattern> : InstR600 <(outs R600_Reg32:$dst), (ins // Slot X UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, @@ -1335,7 +1322,9 @@ // Regist loads and stores - for indirect addressing //===----------------------------------------------------------------------===// +let Namespace = "R600" in { defm R600_ : RegisterLoadStore ; +} // Hardcode channel to 0 // NOTE: LSHR is not available here. 
LSHR is per family instruction @@ -1387,11 +1376,12 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in { -def MASK_WRITE : AMDGPUShaderInst < +def MASK_WRITE : InstR600 < (outs), (ins R600_Reg32:$src), "MASK_WRITE $src", - [] + [], + NullALU >; } // End mayLoad = 0, mayStore = 0, hasSideEffects = 1 @@ -1422,7 +1412,7 @@ // Constant Buffer Addressing Support //===----------------------------------------------------------------------===// -let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +let usesCustomInserter = 1, isCodeGenOnly = 1, isPseudo = 1, Namespace = "R600" in { def CONST_COPY : Instruction { let OutOperandList = (outs R600_Reg32:$dst); let InOperandList = (ins i32imm:$src); @@ -1545,23 +1535,6 @@ //===---------------------------------------------------------------------===// // Flow and Program control Instructions //===---------------------------------------------------------------------===// -class ILFormat pattern> -: Instruction { - - let Namespace = "AMDGPU"; - dag OutOperandList = outs; - dag InOperandList = ins; - let Pattern = pattern; - let AsmString = !strconcat(asmstr, "\n"); - let isPseudo = 1; - let Itinerary = NullALU; - bit hasIEEEFlag = 0; - bit hasZeroOpFlag = 0; - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let isCodeGenOnly = 1; -} multiclass BranchConditional { def _i32 : ILFormat<(outs), @@ -1593,23 +1566,14 @@ // Custom Inserter for Branches and returns, this eventually will be a // separate pass //===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1, + Namespace = "R600" in { def BRANCH : ILFormat<(outs), (ins brtarget:$target), "; Pseudo unconditional branch instruction", [(br bb:$target)]>; defm BRANCH_COND : BranchConditional; } -//===---------------------------------------------------------------------===// -// Return instruction -//===---------------------------------------------------------------------===// -let isTerminator = 1, isReturn = 1, hasCtrlDep = 1, - usesCustomInserter = 1 in { - def RETURN : ILFormat<(outs), (ins variable_ops), - "RETURN", [(AMDGPUendpgm)] - >; -} - //===----------------------------------------------------------------------===// // Branch Instructions //===----------------------------------------------------------------------===// @@ -1740,7 +1704,7 @@ // KIL Patterns def KIL : R600Pat < - (int_AMDGPU_kill f32:$src0), + (int_r600_kill f32:$src0), (MASK_WRITE (KILLGT (f32 ZERO), $src0)) >; Index: lib/Target/AMDGPU/R600IntrinsicInfo.h =================================================================== --- /dev/null +++ lib/Target/AMDGPU/R600IntrinsicInfo.h @@ -0,0 +1,58 @@ +//===- R600IntrinsicInfo.h - R600 Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface for the R600 Implementation of the Intrinsic Info class. 
+// +//===-----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_AMDGPU_R600INTRINSICINFO_H +#define LLVM_LIB_TARGET_AMDGPU_R600INTRINSICINFO_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { +class TargetMachine; + +namespace r600Intrinsic { +enum ID { + last_non_R600_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "R600GenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_R600_intrinsics +}; + +} // end namespace R600Intrinsic + +class R600IntrinsicInfo final : public TargetIntrinsicInfo { +public: + R600IntrinsicInfo(); + + StringRef getName(unsigned IntrId, ArrayRef Tys = None) const; + + std::string getName(unsigned IntrId, Type **Tys = nullptr, + unsigned NumTys = 0) const override; + + unsigned lookupName(const char *Name, unsigned Len) const override; + bool isOverloaded(unsigned IID) const override; + Function *getDeclaration(Module *M, unsigned ID, + Type **Tys = nullptr, + unsigned NumTys = 0) const override; + + Function *getDeclaration(Module *M, unsigned ID, + ArrayRef = None) const; + + FunctionType *getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys = None) const; +}; + +} // end namespace llvm + +#endif Index: lib/Target/AMDGPU/R600IntrinsicInfo.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/R600IntrinsicInfo.cpp @@ -0,0 +1,103 @@ +//===- R600IntrinsicInfo.cpp - R600 Intrinsic Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "R600IntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" + +using namespace llvm; + +R600IntrinsicInfo::R600IntrinsicInfo() + : TargetIntrinsicInfo() {} + +static const char *const IntrinsicNameTable[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "R600GenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE +}; + +namespace { +#define GET_INTRINSIC_ATTRIBUTES +#include "R600GenIntrinsics.inc" +#undef GET_INTRINSIC_ATTRIBUTES +} + +StringRef R600IntrinsicInfo::getName(unsigned IntrID, + ArrayRef Tys) const { + if (IntrID < Intrinsic::num_intrinsics) + return StringRef(); + + assert(IntrID < r600Intrinsic::num_R600_intrinsics && + "Invalid intrinsic ID"); + + return IntrinsicNameTable[IntrID - Intrinsic::num_intrinsics]; +} + +std::string R600IntrinsicInfo::getName(unsigned IntrID, Type **Tys, + unsigned NumTys) const { + return getName(IntrID, makeArrayRef(Tys, NumTys)).str(); +} + +FunctionType *R600IntrinsicInfo::getType(LLVMContext &Context, unsigned ID, + ArrayRef Tys) const { + // FIXME: Re-use Intrinsic::getType machinery + llvm_unreachable("unhandled intrinsic"); +} + +unsigned R600IntrinsicInfo::lookupName(const char *NameData, + unsigned Len) const { + StringRef Name(NameData, Len); + if (!Name.startswith("llvm.")) + return 0; // All intrinsics start with 'llvm.' + + // Look for a name match in our table. If the intrinsic is not overloaded, + // require an exact match. If it is overloaded, require a prefix match. 
The + // R600 enum enum starts at Intrinsic::num_intrinsics. + int Idx = Intrinsic::lookupLLVMIntrinsicByName(IntrinsicNameTable, Name); + if (Idx >= 0) { + bool IsPrefixMatch = Name.size() > strlen(IntrinsicNameTable[Idx]); + return IsPrefixMatch == isOverloaded(Idx + 1) + ? Intrinsic::num_intrinsics + Idx + : 0; + } + + return 0; +} + +bool R600IntrinsicInfo::isOverloaded(unsigned id) const { +// Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "R600GenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +Function *R600IntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + ArrayRef Tys) const { + FunctionType *FTy = getType(M->getContext(), IntrID, Tys); + Function *F + = cast(M->getOrInsertFunction(getName(IntrID, Tys), FTy)); + + AttributeList AS = + getAttributes(M->getContext(), static_cast(IntrID)); + F->setAttributes(AS); + return F; +} + +Function *R600IntrinsicInfo::getDeclaration(Module *M, unsigned IntrID, + Type **Tys, + unsigned NumTys) const { + return getDeclaration(M, IntrID, makeArrayRef(Tys, NumTys)); +} Index: lib/Target/AMDGPU/R600Intrinsics.td =================================================================== --- lib/Target/AMDGPU/R600Intrinsics.td +++ lib/Target/AMDGPU/R600Intrinsics.td @@ -59,5 +59,7 @@ def int_r600_dot4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; + +def int_r600_kill : Intrinsic<[], [llvm_float_ty], []>; } // End TargetPrefix = "r600", isTarget = 1 Index: lib/Target/AMDGPU/R600MachineScheduler.cpp =================================================================== --- lib/Target/AMDGPU/R600MachineScheduler.cpp +++ lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -164,7 +164,7 @@ for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(), E = SU->getInstr()->operands_end(); It != E; ++It) { MachineOperand &MO = *It; - if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X) + if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X) ++CurEmitted; } } @@ -184,7 +184,7 @@ static bool isPhysicalRegCopy(MachineInstr *MI) { - if (MI->getOpcode() != AMDGPU::COPY) + if (MI->getOpcode() != R600::COPY) return false; return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg()); @@ -227,14 +227,14 @@ return AluTrans; switch (MI->getOpcode()) { - case AMDGPU::PRED_X: + case R600::PRED_X: return AluPredX; - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return AluT_XYZW; - case AMDGPU::COPY: + case R600::COPY: if (MI->getOperand(1).isUndef()) { // MI will become a KILL, don't considers it in scheduling return AluDiscarded; @@ -249,7 +249,7 @@ if(TII->isVector(*MI) || TII->isCubeOp(MI->getOpcode()) || TII->isReductionOp(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::GROUP_BARRIER) { + MI->getOpcode() == R600::GROUP_BARRIER) { return AluT_XYZW; } @@ -260,13 +260,13 @@ // Is the result already assigned to a channel ? unsigned DestSubReg = MI->getOperand(0).getSubReg(); switch (DestSubReg) { - case AMDGPU::sub0: + case R600::sub0: return AluT_X; - case AMDGPU::sub1: + case R600::sub1: return AluT_Y; - case AMDGPU::sub2: + case R600::sub2: return AluT_Z; - case AMDGPU::sub3: + case R600::sub3: return AluT_W; default: break; @@ -274,16 +274,16 @@ // Is the result already member of a X/Y/Z/W class ? 
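// ----[ Editor's illustration - not part of the patch ]---------------------
// The intrinsic-info code above keeps target intrinsics in their own ID range
// that begins where the generic LLVM intrinsics end; getName()/lookupName()
// translate by adding or subtracting that base. A self-contained model of the
// offsetting (the base value is a stand-in for Intrinsic::num_intrinsics, and
// the overloaded/prefix-match subtlety is omitted):
#include <string>

namespace sketch {
constexpr unsigned NumGenericIntrinsics = 1000;  // stand-in base
static const char *const NameTable[] = {"llvm.r600.dot4", "llvm.r600.kill"};

// Returns a target ID (>= NumGenericIntrinsics) or 0 if the name is unknown.
unsigned lookupName(const std::string &Name) {
  for (unsigned i = 0; i != sizeof(NameTable) / sizeof(NameTable[0]); ++i)
    if (Name == NameTable[i])
      return NumGenericIntrinsics + i;
  return 0;
}

const char *getName(unsigned ID) {
  return ID >= NumGenericIntrinsics ? NameTable[ID - NumGenericIntrinsics] : "";
}
} // namespace sketch
// ----[ end illustration ]---------------------------------------------------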
unsigned DestReg = MI->getOperand(0).getReg(); - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) || - regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) || + regBelongsToClass(DestReg, &R600::R600_AddrRegClass)) return AluT_X; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass)) return AluT_Y; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass)) return AluT_Z; - if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass)) + if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass)) return AluT_W; - if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass)) + if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass)) return AluT_XYZW; // LDS src registers cannot be used in the Trans slot. @@ -304,13 +304,13 @@ } switch (Opcode) { - case AMDGPU::PRED_X: - case AMDGPU::COPY: - case AMDGPU::CONST_COPY: - case AMDGPU::INTERP_PAIR_XY: - case AMDGPU::INTERP_PAIR_ZW: - case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case R600::PRED_X: + case R600::COPY: + case R600::CONST_COPY: + case R600::INTERP_PAIR_XY: + case R600::INTERP_PAIR_ZW: + case R600::INTERP_VEC_LOAD: + case R600::DOT_4: return IDAlu; default: return IDOther; @@ -356,7 +356,7 @@ } void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) { - int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst); if (DstIndex == -1) { return; } @@ -373,16 +373,16 @@ // Constrains the regclass of DestReg to assign it to Slot switch (Slot) { case 0: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass); break; case 1: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass); break; case 2: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass); break; case 3: - MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass); + MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass); break; } } Index: lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp =================================================================== --- lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -79,7 +79,7 @@ std::vector UndefReg; RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) { - assert(MI->getOpcode() == AMDGPU::REG_SEQUENCE); + assert(MI->getOpcode() == R600::REG_SEQUENCE); for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) { MachineOperand &MO = Instr->getOperand(i); unsigned Chan = Instr->getOperand(i + 1).getImm(); @@ -159,8 +159,8 @@ if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) return true; switch (MI.getOpcode()) { - case AMDGPU::R600_ExportSwz: - case AMDGPU::EG_ExportSwz: + case R600::R600_ExportSwz: + case R600::EG_ExportSwz: return true; default: return false; @@ -213,12 +213,12 @@ std::vector UpdatedUndef = BaseRSI->UndefReg; for (DenseMap::iterator It = RSI->RegToChan.begin(), E = RSI->RegToChan.end(); It != E; ++It) { - unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); unsigned 
SubReg = (*It).first; unsigned Swizzle = (*It).second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); - MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG), + MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), DstReg) .addReg(SrcVec) .addReg(SubReg) @@ -234,7 +234,7 @@ SrcVec = DstReg; } MachineInstr *NewMI = - BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg).addReg(SrcVec); + BuildMI(MBB, Pos, DL, TII->get(R600::COPY), Reg).addReg(SrcVec); DEBUG(dbgs() << " ->"; NewMI->dump();); DEBUG(dbgs() << " Updating Swizzle:\n"); @@ -354,7 +354,7 @@ for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end(); MII != MIIE; ++MII) { MachineInstr &MI = *MII; - if (MI.getOpcode() != AMDGPU::REG_SEQUENCE) { + if (MI.getOpcode() != R600::REG_SEQUENCE) { if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) { unsigned Reg = MI.getOperand(1).getReg(); for (MachineRegisterInfo::def_instr_iterator Index: lib/Target/AMDGPU/R600Packetizer.cpp =================================================================== --- lib/Target/AMDGPU/R600Packetizer.cpp +++ lib/Target/AMDGPU/R600Packetizer.cpp @@ -84,39 +84,39 @@ LastDstChan = BISlot; if (TII->isPredicated(*BI)) continue; - int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); + int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) continue; - int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst); + int DstIdx = TII->getOperandIdx(BI->getOpcode(), R600::OpName::dst); if (DstIdx == -1) { continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); if (isTrans || TII->isTransOnly(*BI)) { - Result[Dst] = AMDGPU::PS; + Result[Dst] = R600::PS; continue; } - if (BI->getOpcode() == AMDGPU::DOT4_r600 || - BI->getOpcode() == AMDGPU::DOT4_eg) { - Result[Dst] = AMDGPU::PV_X; + if (BI->getOpcode() == R600::DOT4_r600 || + BI->getOpcode() == R600::DOT4_eg) { + Result[Dst] = R600::PV_X; continue; } - if (Dst == AMDGPU::OQAP) { + if (Dst == R600::OQAP) { continue; } unsigned PVReg = 0; switch (TRI.getHWRegChan(Dst)) { case 0: - PVReg = AMDGPU::PV_X; + PVReg = R600::PV_X; break; case 1: - PVReg = AMDGPU::PV_Y; + PVReg = R600::PV_Y; break; case 2: - PVReg = AMDGPU::PV_Z; + PVReg = R600::PV_Z; break; case 3: - PVReg = AMDGPU::PV_W; + PVReg = R600::PV_W; break; default: llvm_unreachable("Invalid Chan"); @@ -129,9 +129,9 @@ void substitutePV(MachineInstr &MI, const DenseMap &PVs) const { unsigned Ops[] = { - AMDGPU::OpName::src0, - AMDGPU::OpName::src1, - AMDGPU::OpName::src2 + R600::OpName::src0, + R600::OpName::src1, + R600::OpName::src2 }; for (unsigned i = 0; i < 3; i++) { int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); @@ -171,7 +171,7 @@ return true; if (!TII->isALUInstr(MI.getOpcode())) return true; - if (MI.getOpcode() == AMDGPU::GROUP_BARRIER) + if (MI.getOpcode() == R600::GROUP_BARRIER) return true; // XXX: This can be removed once the packetizer properly handles all the // LDS instruction group restrictions. @@ -185,8 +185,8 @@ if (getSlot(*MII) == getSlot(*MIJ)) ConsideredInstUsesAlreadyWrittenVectorElement = true; // Does MII and MIJ share the same pred_sel ? 
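// ----[ Editor's illustration - not part of the patch ]---------------------
// getPreviousVector() above forwards results of the previous instruction
// group through PV_X/Y/Z/W (or PS for the trans slot), selected by the
// destination's hardware channel. A minimal standalone model of that mapping
// (enum values are illustrative):
enum FwdReg { PV_X, PV_Y, PV_Z, PV_W, PS };

static FwdReg previousValueReg(unsigned HwChan, bool IsTransSlot) {
  if (IsTransSlot)
    return PS;            // trans-slot results come back through PS
  switch (HwChan) {
  case 0:  return PV_X;
  case 1:  return PV_Y;
  case 2:  return PV_Z;
  default: return PV_W;   // channel 3 (W)
  }
}
// ----[ end illustration ]---------------------------------------------------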
- int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel), - OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel); + int OpI = TII->getOperandIdx(MII->getOpcode(), R600::OpName::pred_sel), + OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600::OpName::pred_sel); unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0, PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0; if (PredI != PredJ) @@ -220,7 +220,7 @@ } void setIsLastBit(MachineInstr *MI, unsigned Bit) const { - unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last); + unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600::OpName::last); MI->getOperand(LastOp).setImm(Bit); } @@ -301,11 +301,11 @@ for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) { MachineInstr *MI = CurrentPacketMIs[i]; unsigned Op = TII->getOperandIdx(MI->getOpcode(), - AMDGPU::OpName::bank_swizzle); + R600::OpName::bank_swizzle); MI->getOperand(Op).setImm(BS[i]); } unsigned Op = - TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::bank_swizzle); + TII->getOperandIdx(MI.getOpcode(), R600::OpName::bank_swizzle); MI.getOperand(Op).setImm(BS.back()); if (!CurrentPacketMIs.empty()) setIsLastBit(CurrentPacketMIs.back(), 0); @@ -334,6 +334,7 @@ // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); + assert(Packetizer.getResourceTracker()->getInstrItins()); if (Packetizer.getResourceTracker()->getInstrItins()->isEmpty()) return false; @@ -353,8 +354,8 @@ MachineBasicBlock::iterator End = MBB->end(); MachineBasicBlock::iterator MI = MBB->begin(); while (MI != End) { - if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF || - (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { + if (MI->isKill() || MI->getOpcode() == R600::IMPLICIT_DEF || + (MI->getOpcode() == R600::CF_ALU && !MI->getOperand(8).getImm())) { MachineBasicBlock::iterator DeleteMI = MI; ++MI; MBB->erase(DeleteMI); Index: lib/Target/AMDGPU/R600Processors.td =================================================================== --- lib/Target/AMDGPU/R600Processors.td +++ lib/Target/AMDGPU/R600Processors.td @@ -7,6 +7,62 @@ // //===----------------------------------------------------------------------===// +class SubtargetFeatureFetchLimit : + SubtargetFeature <"fetch"#Value, + "TexVTXClauseSize", + Value, + "Limit the maximum number of fetches in a clause to "#Value +>; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding" +>; + +def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; +def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; + +def FeatureVertexCache : SubtargetFeature<"HasVertexCache", + "HasVertexCache", + "true", + "Specify use of dedicated vertex cache" +>; + +def FeatureCaymanISA : SubtargetFeature<"caymanISA", + "CaymanISA", + "true", + "Use Cayman ISA" +>; + +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug" +>; + +class R600SubtargetFeatureGeneration Implies> : + SubtargetFeatureGeneration ; + +def FeatureR600 : R600SubtargetFeatureGeneration<"R600", + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0] +>; + +def FeatureR700 : R600SubtargetFeatureGeneration<"R700", + [FeatureFetchLimit16, FeatureLocalMemorySize0] +>; + +def FeatureEvergreen : R600SubtargetFeatureGeneration<"EVERGREEN", + [FeatureFetchLimit16, FeatureLocalMemorySize32768] +>; + +def FeatureNorthernIslands : 
R600SubtargetFeatureGeneration<"NORTHERN_ISLANDS", + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; + + //===----------------------------------------------------------------------===// // Radeon HD 2000/3000 Series (R600). //===----------------------------------------------------------------------===// Index: lib/Target/AMDGPU/R600RegisterInfo.h =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.h +++ lib/Target/AMDGPU/R600RegisterInfo.h @@ -15,13 +15,14 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H #define LLVM_LIB_TARGET_AMDGPU_R600REGISTERINFO_H -#include "AMDGPURegisterInfo.h" +#define GET_REGINFO_HEADER +#include "R600GenRegisterInfo.inc" namespace llvm { class AMDGPUSubtarget; -struct R600RegisterInfo final : public AMDGPURegisterInfo { +struct R600RegisterInfo final : public R600GenRegisterInfo { RegClassWeight RCW; R600RegisterInfo(); @@ -49,6 +50,8 @@ void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; + + void reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const; }; } // End namespace llvm Index: lib/Target/AMDGPU/R600RegisterInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.cpp +++ lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -21,34 +21,37 @@ using namespace llvm; -R600RegisterInfo::R600RegisterInfo() : AMDGPURegisterInfo() { +R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) { RCW.RegWeight = 0; RCW.WeightLimit = 0; } +#define GET_REGINFO_TARGET_DESC +#include "R600GenRegisterInfo.inc" + BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const R600Subtarget &ST = MF.getSubtarget(); const R600InstrInfo *TII = ST.getInstrInfo(); - reserveRegisterTuples(Reserved, AMDGPU::ZERO); - reserveRegisterTuples(Reserved, AMDGPU::HALF); - reserveRegisterTuples(Reserved, AMDGPU::ONE); - reserveRegisterTuples(Reserved, AMDGPU::ONE_INT); - reserveRegisterTuples(Reserved, AMDGPU::NEG_HALF); - reserveRegisterTuples(Reserved, AMDGPU::NEG_ONE); - reserveRegisterTuples(Reserved, AMDGPU::PV_X); - reserveRegisterTuples(Reserved, AMDGPU::ALU_LITERAL_X); - reserveRegisterTuples(Reserved, AMDGPU::ALU_CONST); - reserveRegisterTuples(Reserved, AMDGPU::PREDICATE_BIT); - reserveRegisterTuples(Reserved, AMDGPU::PRED_SEL_OFF); - reserveRegisterTuples(Reserved, AMDGPU::PRED_SEL_ZERO); - reserveRegisterTuples(Reserved, AMDGPU::PRED_SEL_ONE); - reserveRegisterTuples(Reserved, AMDGPU::INDIRECT_BASE_ADDR); - - for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(), - E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) { + reserveRegisterTuples(Reserved, R600::ZERO); + reserveRegisterTuples(Reserved, R600::HALF); + reserveRegisterTuples(Reserved, R600::ONE); + reserveRegisterTuples(Reserved, R600::ONE_INT); + reserveRegisterTuples(Reserved, R600::NEG_HALF); + reserveRegisterTuples(Reserved, R600::NEG_ONE); + reserveRegisterTuples(Reserved, R600::PV_X); + reserveRegisterTuples(Reserved, R600::ALU_LITERAL_X); + reserveRegisterTuples(Reserved, R600::ALU_CONST); + reserveRegisterTuples(Reserved, R600::PREDICATE_BIT); + reserveRegisterTuples(Reserved, R600::PRED_SEL_OFF); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ZERO); + reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); + reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); + + for (TargetRegisterClass::iterator I = 
R600::R600_AddrRegClass.begin(), + E = R600::R600_AddrRegClass.end(); I != E; ++I) { reserveRegisterTuples(Reserved, *I); } @@ -58,7 +61,7 @@ } // Dummy to not crash RegisterClassInfo. -static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister; +static const MCPhysReg CalleeSavedReg = R600::NoRegister; const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs( const MachineFunction *) const { @@ -66,7 +69,7 @@ } unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - return AMDGPU::NoRegister; + return R600::NoRegister; } unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { @@ -81,7 +84,7 @@ MVT VT) const { switch(VT.SimpleTy) { default: - case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + case MVT::i32: return &R600::R600_TReg32RegClass; } } @@ -94,9 +97,9 @@ assert(!TargetRegisterInfo::isVirtualRegister(Reg)); switch (Reg) { - case AMDGPU::OQAP: - case AMDGPU::OQBP: - case AMDGPU::AR_X: + case R600::OQAP: + case R600::OQBP: + case R600::AR_X: return false; default: return true; @@ -109,3 +112,10 @@ RegScavenger *RS) const { llvm_unreachable("Subroutines not supported yet"); } + +void R600RegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { + MCRegAliasIterator R(Reg, this, true); + + for (; R.isValid(); ++R) + Reserved.set(*R); +} Index: lib/Target/AMDGPU/R600RegisterInfo.td =================================================================== --- lib/Target/AMDGPU/R600RegisterInfo.td +++ lib/Target/AMDGPU/R600RegisterInfo.td @@ -245,7 +245,7 @@ (add V0123_W, V0123_Z, V0123_Y, V0123_X) >; -def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, +def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64, (add (sequence "T%u_XY", 0, 63))>; def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, Index: lib/Target/AMDGPU/R700Instructions.td =================================================================== --- lib/Target/AMDGPU/R700Instructions.td +++ lib/Target/AMDGPU/R700Instructions.td @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -def isR700 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::R700">; +def isR700 : Predicate<"Subtarget->getGeneration() == R600Subtarget::R700">; let Predicates = [isR700] in { def SIN_r700 : SIN_Common<0x6E>; Index: lib/Target/AMDGPU/SIFoldOperands.cpp =================================================================== --- lib/Target/AMDGPU/SIFoldOperands.cpp +++ lib/Target/AMDGPU/SIFoldOperands.cpp @@ -76,7 +76,7 @@ MachineRegisterInfo *MRI; const SIInstrInfo *TII; const SIRegisterInfo *TRI; - const SISubtarget *ST; + const AMDGPUSubtarget *ST; void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, @@ -962,7 +962,7 @@ return false; MRI = &MF.getRegInfo(); - ST = &MF.getSubtarget(); + ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); Index: lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- lib/Target/AMDGPU/SIISelLowering.h +++ lib/Target/AMDGPU/SIISelLowering.h @@ -22,6 +22,9 @@ namespace llvm { class SITargetLowering final : public AMDGPUTargetLowering { +private: + const SISubtarget *Subtarget; + SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, uint64_t Offset) const; SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- 
lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -113,7 +113,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, const SISubtarget &STI) - : AMDGPUTargetLowering(TM, STI) { + : AMDGPUTargetLowering(TM, STI), + Subtarget(&STI) { addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass); addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); @@ -146,7 +147,7 @@ addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); } - computeRegisterProperties(STI.getRegisterInfo()); + computeRegisterProperties(Subtarget->getRegisterInfo()); // We need to custom lower vector stores from local memory setOperationAction(ISD::LOAD, MVT::v2i32, Custom); @@ -306,7 +307,7 @@ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); - if (getSubtarget()->hasFlatAddressSpace()) { + if (Subtarget->hasFlatAddressSpace()) { setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } @@ -326,6 +327,11 @@ setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + } else { + setOperationAction(ISD::FCEIL, MVT::f64, Custom); + setOperationAction(ISD::FTRUNC, MVT::f64, Custom); + setOperationAction(ISD::FRINT, MVT::f64, Custom); + setOperationAction(ISD::FFLOOR, MVT::f64, Custom); } setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -550,7 +556,7 @@ } const SISubtarget *SITargetLowering::getSubtarget() const { - return static_cast(Subtarget); + return Subtarget; } //===----------------------------------------------------------------------===// @@ -1900,8 +1906,7 @@ // FIXME: Does sret work properly? if (!Info->isEntryFunction()) { - const SIRegisterInfo *TRI - = static_cast(Subtarget)->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { @@ -2003,8 +2008,7 @@ SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; - const SISubtarget *ST = getSubtarget(); - const SIRegisterInfo *TRI = ST->getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); auto &ArgUsageInfo = DAG.getPass()->getAnalysis(); @@ -2442,7 +2446,7 @@ // Add a register mask operand representing the call-preserved registers. - const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo(); + auto *TRI = static_cast(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -7590,8 +7594,7 @@ MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); - const SISubtarget &ST = MF.getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { // Callable functions have fixed registers used for stack access. Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -922,7 +922,7 @@ // All waits must be resolved at call return. // NOTE: this could be improved with knowledge of all call sites or // with knowledge of the called routines. 
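// ----[ Editor's illustration - not part of the patch ]---------------------
// The note above: when control leaves the function through a return-like
// instruction, every outstanding memory counter has to be waited on, because
// neither side of the call boundary knows what is still in flight. A
// simplified standalone model of that decision (counter names follow the
// usual vmcnt/lgkmcnt/expcnt grouping; field layout is illustrative):
struct PendingCounters {
  unsigned VmCnt = 0;    // outstanding vector-memory operations
  unsigned LgkmCnt = 0;  // outstanding LDS/GDS/constant/message operations
  unsigned ExpCnt = 0;   // outstanding exports
};

static bool mustWaitBeforeReturn(bool IsReturnLike, const PendingCounters &P) {
  return IsReturnLike && (P.VmCnt + P.LgkmCnt + P.ExpCnt) != 0;
}
// ----[ end illustration ]---------------------------------------------------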
- if (MI.getOpcode() == AMDGPU::RETURN || + if ( MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG || MI.getOpcode() == AMDGPU::S_SETPC_B64_return) { for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; @@ -1119,7 +1119,7 @@ // TODO: Remove this work-around, enable the assert for Bug 457939 // after fixing the scheduler. Also, the Shader Compiler code is // independent of target. - if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { @@ -1698,7 +1698,7 @@ if (ScoreBrackets->getScoreLB(LGKM_CNT) < ScoreBrackets->getScoreUB(LGKM_CNT) && ScoreBrackets->hasPendingSMEM()) { - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) + if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) VCCZBugWorkAround = true; } } Index: lib/Target/AMDGPU/SIInsertWaits.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaits.cpp +++ lib/Target/AMDGPU/SIInsertWaits.cpp @@ -71,7 +71,7 @@ class SIInsertWaits : public MachineFunctionPass { private: - const SISubtarget *ST = nullptr; + const AMDGPUSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI; @@ -322,7 +322,7 @@ return; } - if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { + if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // @@ -501,7 +501,7 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { - if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) + if (ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return; // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG. @@ -538,7 +538,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; - ST = &MF.getSubtarget(); + ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -574,7 +574,7 @@ if (!HaveScalarStores && TII->isScalarStore(*I)) HaveScalarStores = true; - if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) { + if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) { // There is a hardware bug on CI/SI where SMRD instruction may corrupt // vccz bit, so when we detect that an instruction may read from a // corrupt vccz bit, we need to: Index: lib/Target/AMDGPU/SIInstrFormats.td =================================================================== --- lib/Target/AMDGPU/SIInstrFormats.td +++ lib/Target/AMDGPU/SIInstrFormats.td @@ -21,7 +21,7 @@ class InstSI pattern = []> : - AMDGPUInst, PredicateControl { + AMDGPUInst, GCNPredicateControl { let SubtargetPredicate = isGCN; // Low bits - basic encoding information. 
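// ----[ Editor's illustration - not part of the patch ]---------------------
// The SIInsertWaitcnts/SIInsertWaits hunks above gate the SI/CI vccz
// workaround on three conditions: the instruction reads vccz, the subtarget
// is SEA_ISLANDS or older, and an SMEM operation is still outstanding (the
// LGKM counter has not caught up). A standalone restatement of that predicate:
enum Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS, GFX9 };

static bool needsVcczWorkaround(Generation Gen, bool ReadsVccz,
                                bool PendingSMEM, bool LgkmOutstanding) {
  return ReadsVccz && Gen <= SEA_ISLANDS && PendingSMEM && LgkmOutstanding;
}
// ----[ end illustration ]---------------------------------------------------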
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -31,6 +31,9 @@
 #include
 #include
+#define GET_INSTRINFO_HEADER
+#include "AMDGPUGenInstrInfo.inc"
+
 namespace llvm {
 class APInt;
@@ -39,7 +42,7 @@
 class SISubtarget;
 class TargetRegisterClass;
-class SIInstrInfo final : public AMDGPUInstrInfo {
+class SIInstrInfo final : public AMDGPUGenInstrInfo {
 private:
   const SIRegisterInfo RI;
   const SISubtarget &ST;
@@ -163,7 +166,10 @@
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
                            MachineInstr &SecondLdSt, unsigned BaseReg2,
-                           unsigned NumLoads) const final;
+                           unsigned NumLoads) const override;
+
+  bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
+                               int64_t Offset1, unsigned NumLoads) const override;
   void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                    const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
@@ -879,6 +885,12 @@
   static bool isLegalMUBUFImmOffset(unsigned Imm) {
     return isUInt<12>(Imm);
   }
+
+  /// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
+  /// Return -1 if the target-specific opcode for the pseudo instruction does
+  /// not exist. If Opcode is not a pseudo instruction, this is identity.
+  int pseudoToMCOpcode(int Opcode) const;
+
 };
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "SIInstrInfo.h"
 #include "AMDGPU.h"
+#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "GCNHazardRecognizer.h"
 #include "SIDefines.h"
@@ -63,6 +64,20 @@
 using namespace llvm;
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AMDGPUGenInstrInfo.inc"
+
+namespace llvm {
+namespace AMDGPU {
+#define GET_RSRCINTRINSIC_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+
+#define GET_D16IMAGEDIMINTRINSIC_IMPL
+#include "AMDGPUGenSearchableTables.inc"
+}
+}
+
+
 // Must be at least 4 to be able to branch over minimum unconditional branch
 // code. This is only for making it possible to write reasonably small tests for
 // long branches.
@@ -71,7 +86,8 @@
   cl::desc("Restrict range of branch instructions (DEBUG)"));
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
-  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
+  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
+    RI(ST), ST(ST) {}
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
 //===----------------------------------------------------------------------===//
@@ -438,6 +454,28 @@
   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
 }
+// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
+// the first 16 loads will be interleaved with the stores, and the next 16 will
+// be clustered as expected. It should really split into 2 batches of 16 stores.
+//
+// Loads are clustered until this returns false, rather than trying to schedule
+// groups of stores. This also means we have to deal with saying different
+// address space loads should be clustered, and ones which might cause bank
+// conflicts.
+//
+// This might be deprecated so it might not be worth that much effort to fix.
+bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
+                                          int64_t Offset0, int64_t Offset1,
+                                          unsigned NumLoads) const {
+  assert(Offset1 > Offset0 &&
+         "Second offset should be larger than first offset!");
+  // If we have less than 16 loads in a row, and the offsets are within 64
+  // bytes, then schedule together.
+
+  // A cacheline is 64 bytes (for global memory).
+  return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
+}
+
 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, unsigned DestReg,
@@ -998,7 +1036,7 @@
                                        unsigned FrameOffset, unsigned Size) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
   unsigned WavefrontSize = ST.getWavefrontSize();
@@ -1134,7 +1172,7 @@
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
   switch (MI.getOpcode()) {
-  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+  default: return TargetInstrInfo::expandPostRAPseudo(MI);
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
@@ -1884,16 +1922,16 @@
   switch(Kind) {
   case PseudoSourceValue::Stack:
   case PseudoSourceValue::FixedStack:
-    return AMDGPUASI.PRIVATE_ADDRESS;
+    return ST.getAMDGPUAS().PRIVATE_ADDRESS;
   case PseudoSourceValue::ConstantPool:
   case PseudoSourceValue::GOT:
   case PseudoSourceValue::JumpTable:
   case PseudoSourceValue::GlobalValueCallEntry:
   case PseudoSourceValue::ExternalSymbolCallEntry:
   case PseudoSourceValue::TargetCustom:
-    return AMDGPUASI.CONSTANT_ADDRESS;
+    return ST.getAMDGPUAS().CONSTANT_ADDRESS;
   }
-  return AMDGPUASI.FLAT_ADDRESS;
+  return ST.getAMDGPUAS().FLAT_ADDRESS;
 }
 static void removeModOperands(MachineInstr &MI) {
@@ -4605,7 +4643,7 @@
     return AMDGPU::NoRegister;
   assert(!MI.memoperands_empty() &&
-         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
+         (*MI.memoperands_begin())->getAddrSpace() == ST.getAMDGPUAS().PRIVATE_ADDRESS);
   FrameIndex = Addr->getIndex();
   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -4724,7 +4762,7 @@
     return true;
   for (const MachineMemOperand *MMO : MI.memoperands()) {
-    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
+    if (MMO->getAddrSpace() == ST.getAMDGPUAS().FLAT_ADDRESS)
       return true;
   }
   return false;
@@ -4904,3 +4942,56 @@
   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
   return RCID == AMDGPU::SReg_128RegClassID;
 }
+
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+enum SIEncodingFamily {
+  SI = 0,
+  VI = 1,
+  SDWA = 2,
+  SDWA9 = 3,
+  GFX80 = 4,
+  GFX9 = 5
+};
+
+static SIEncodingFamily subtargetEncodingFamily(const SISubtarget &ST) {
+  switch (ST.getGeneration()) {
+  case SISubtarget::SOUTHERN_ISLANDS:
+  case SISubtarget::SEA_ISLANDS:
+    return SIEncodingFamily::SI;
+  case SISubtarget::VOLCANIC_ISLANDS:
+  case SISubtarget::GFX9:
+    return SIEncodingFamily::VI;
+  }
+  llvm_unreachable("Unknown subtarget generation!");
+}
+
+int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+  SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+
+  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
+      ST.getGeneration() >= SISubtarget::GFX9)
+    Gen = SIEncodingFamily::GFX9;
+
+  if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
+    Gen = ST.getGeneration() == SISubtarget::GFX9 ? SIEncodingFamily::SDWA9
+                                                  : SIEncodingFamily::SDWA;
+
+  // Adjust the encoding family to GFX80 for D16 buffer instructions when the
+  // subtarget has UnpackedD16VMem feature.
+  // TODO: remove this when we discard GFX80 encoding.
+  if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16)
+      && !(get(Opcode).TSFlags & SIInstrFlags::MIMG))
+    Gen = SIEncodingFamily::GFX80;
+
+  int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+
+  // -1 means that Opcode is already a native instruction.
+  if (MCOp == -1)
+    return Opcode;
+
+  // (uint16_t)-1 means that Opcode is a pseudo instruction that has
+  // no encoding in the given subtarget generation.
+  if (MCOp == (uint16_t)-1)
+    return -1;
+
+  return MCOp;
+}
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -17,6 +17,11 @@
 def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+class GCNPredicateControl : PredicateControl {
+  Predicate SIAssemblerPredicate = isSICI;
+  Predicate VIAssemblerPredicate = isVI;
+}
+
 // Execpt for the NONE field, this must be kept in sync with the
 // SIEncodingFamily enum in AMDGPUInstrInfo.cpp
 def SIEncodingFamily {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -11,11 +11,10 @@
 // that are not yet supported remain commented out.
 //===----------------------------------------------------------------------===//
-class GCNPat<dag pattern, dag result> : AMDGPUPat<pattern, result> {
+class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
   let SubtargetPredicate = isGCN;
 }
-
 include "VOPInstructions.td"
 include "SOPInstructions.td"
 include "SMInstructions.td"
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,6 +21,7 @@
 namespace llvm {
+class AMDGPUSubtarget;
 class LiveIntervals;
 class MachineRegisterInfo;
 class SISubtarget;
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1231,8 +1231,6 @@
     &AMDGPU::VReg_512RegClass,
     &AMDGPU::SReg_512RegClass,
     &AMDGPU::SCC_CLASSRegClass,
-    &AMDGPU::R600_Reg32RegClass,
-    &AMDGPU::R600_PredicateRegClass,
    &AMDGPU::Pseudo_SReg_32RegClass,
    &AMDGPU::Pseudo_SReg_128RegClass,
  };
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -92,7 +92,7 @@
 /// \returns Maximum number of waves per execution unit for given subtarget \p
 /// Features without any kind of limitation.
-unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+unsigned getMaxWavesPerEU();
 /// \returns Maximum number of waves per execution unit for given subtarget \p
 /// Features and limited by given \p FlatWorkGroupSize.
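// ---- Editor's illustrative aside (not part of the patch) -------------------
// A caller-side sketch of the AMDGPUBaseInfo change above: getMaxWavesPerEU()
// drops its FeatureBitset parameter because, with the r600 path split out,
// the GCN per-EU wave limit is a constant (10), while getEUsPerCU() still
// depends on the features. The helper name maxWavesForCU is hypothetical and
// is assumed to sit next to the other helpers in AMDGPUBaseInfo.cpp, so the
// calls can stay unqualified exactly as in the hunks below.
static unsigned maxWavesForCU(const FeatureBitset &Features,
                              unsigned RequestedWavesPerEU) {
  // Clamp the request to the fixed per-EU ceiling, then scale by the number
  // of execution units per compute unit, which is still feature-dependent.
  unsigned PerEU = RequestedWavesPerEU < getMaxWavesPerEU()
                       ? RequestedWavesPerEU
                       : getMaxWavesPerEU();
  return PerEU * getEUsPerCU(Features);
}
// -----------------------------------------------------------------------------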
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -224,7 +224,7 @@
   if (Features.test(FeatureGFX9))
     return {9, 0, 0};
-  if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+  if (Features.test(FeatureSouthernIslands))
     return {0, 0, 0};
   return {7, 0, 0};
 }
@@ -282,7 +282,7 @@
 }
 unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
-  return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+  return getMaxWavesPerEU() * getEUsPerCU(Features);
 }
 unsigned getMaxWavesPerCU(const FeatureBitset &Features,
@@ -294,9 +294,7 @@
   return 1;
 }
-unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
-  if (!Features.test(FeatureGCN))
-    return 8;
+unsigned getMaxWavesPerEU() {
   // FIXME: Need to take scratch memory into account.
   return 10;
 }
@@ -352,7 +350,7 @@
 unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
-  if (WavesPerEU >= getMaxWavesPerEU(Features))
+  if (WavesPerEU >= getMaxWavesPerEU())
     return 0;
   unsigned MinNumSGPRs = alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
@@ -392,7 +390,7 @@
 unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
-  if (WavesPerEU >= getMaxWavesPerEU(Features))
+  if (WavesPerEU >= getMaxWavesPerEU())
     return 0;
   unsigned MinNumVGPRs = alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),