Index: llvm/trunk/include/llvm/BinaryFormat/ELF.h =================================================================== --- llvm/trunk/include/llvm/BinaryFormat/ELF.h +++ llvm/trunk/include/llvm/BinaryFormat/ELF.h @@ -703,14 +703,17 @@ EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e, EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f, EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031, + // AMDGCN GFX10. + EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033, // Reserved for AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027, EF_AMDGPU_MACH_AMDGCN_RESERVED1 = 0x030, + EF_AMDGPU_MACH_AMDGCN_RESERVED2 = 0x032, // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX909, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1010, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. Index: llvm/trunk/include/llvm/Support/AMDHSAKernelDescriptor.h =================================================================== --- llvm/trunk/include/llvm/Support/AMDHSAKernelDescriptor.h +++ llvm/trunk/include/llvm/Support/AMDHSAKernelDescriptor.h @@ -88,8 +88,11 @@ COMPUTE_PGM_RSRC1(ENABLE_IEEE_MODE, 23, 1), COMPUTE_PGM_RSRC1(BULKY, 24, 1), COMPUTE_PGM_RSRC1(CDBG_USER, 25, 1), - COMPUTE_PGM_RSRC1(FP16_OVFL, 26, 1), // GFX9+ - COMPUTE_PGM_RSRC1(RESERVED0, 27, 5), + COMPUTE_PGM_RSRC1(FP16_OVFL, 26, 1), // GFX9+ + COMPUTE_PGM_RSRC1(RESERVED0, 27, 2), + COMPUTE_PGM_RSRC1(WGP_MODE, 29, 1), // GFX10+ + COMPUTE_PGM_RSRC1(MEM_ORDERED, 30, 1), // GFX10+ + COMPUTE_PGM_RSRC1(FWD_PROGRESS, 31, 1), // GFX10+ }; #undef COMPUTE_PGM_RSRC1 @@ -119,6 +122,15 @@ }; #undef COMPUTE_PGM_RSRC2 +// Compute program resource register 3. Must match hardware definition. 
+#define COMPUTE_PGM_RSRC3(NAME, SHIFT, WIDTH) \ + AMDHSA_BITS_ENUM_ENTRY(COMPUTE_PGM_RSRC3_ ## NAME, SHIFT, WIDTH) +enum : int32_t { + COMPUTE_PGM_RSRC3(SHARED_VGPR_COUNT, 0, 4), // GFX10+ + COMPUTE_PGM_RSRC3(RESERVED0, 4, 28), +}; +#undef COMPUTE_PGM_RSRC3 + // Kernel code properties. Must be kept backwards compatible. #define KERNEL_CODE_PROPERTY(NAME, SHIFT, WIDTH) \ AMDHSA_BITS_ENUM_ENTRY(KERNEL_CODE_PROPERTY_ ## NAME, SHIFT, WIDTH) @@ -130,7 +142,8 @@ KERNEL_CODE_PROPERTY(ENABLE_SGPR_DISPATCH_ID, 4, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_FLAT_SCRATCH_INIT, 5, 1), KERNEL_CODE_PROPERTY(ENABLE_SGPR_PRIVATE_SEGMENT_SIZE, 6, 1), - KERNEL_CODE_PROPERTY(RESERVED0, 7, 9), + KERNEL_CODE_PROPERTY(RESERVED0, 7, 3), + KERNEL_CODE_PROPERTY(RESERVED1, 11, 5), }; #undef KERNEL_CODE_PROPERTY @@ -140,7 +153,8 @@ uint32_t private_segment_fixed_size; uint8_t reserved0[8]; int64_t kernel_code_entry_byte_offset; - uint8_t reserved1[24]; + uint8_t reserved1[20]; + uint32_t compute_pgm_rsrc3; // GFX10+ uint32_t compute_pgm_rsrc1; uint32_t compute_pgm_rsrc2; uint16_t kernel_code_properties; @@ -166,6 +180,9 @@ offsetof(kernel_descriptor_t, reserved1) == 24, "invalid offset for reserved1"); static_assert( + offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == 44, + "invalid offset for compute_pgm_rsrc3"); +static_assert( offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48, "invalid offset for compute_pgm_rsrc1"); static_assert( Index: llvm/trunk/include/llvm/Support/TargetParser.h =================================================================== --- llvm/trunk/include/llvm/Support/TargetParser.h +++ llvm/trunk/include/llvm/Support/TargetParser.h @@ -123,8 +123,10 @@ GK_GFX906 = 63, GK_GFX909 = 65, + GK_GFX1010 = 71, + GK_AMDGCN_FIRST = GK_GFX600, - GK_AMDGCN_LAST = GK_GFX909, + GK_AMDGCN_LAST = GK_GFX1010, }; /// Instruction set architecture version. 
Index: llvm/trunk/lib/ObjectYAML/ELFYAML.cpp =================================================================== --- llvm/trunk/lib/ObjectYAML/ELFYAML.cpp +++ llvm/trunk/lib/ObjectYAML/ELFYAML.cpp @@ -411,6 +411,7 @@ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX904, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX906, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCase(EF_AMDGPU_XNACK); BCase(EF_AMDGPU_SRAM_ECC); break; Index: llvm/trunk/lib/Support/TargetParser.cpp =================================================================== --- llvm/trunk/lib/Support/TargetParser.cpp +++ llvm/trunk/lib/Support/TargetParser.cpp @@ -62,7 +62,7 @@ // This table should be sorted by the value of GPUKind // Don't bother listing the implicitly true features -constexpr GPUInfo AMDGCNGPUs[33] = { +constexpr GPUInfo AMDGCNGPUs[34] = { // Name Canonical Kind Features // Name {{"gfx600"}, {"gfx600"}, GK_GFX600, FEATURE_FAST_FMA_F32}, @@ -98,6 +98,7 @@ {{"gfx904"}, {"gfx904"}, GK_GFX904, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, {{"gfx906"}, {"gfx906"}, GK_GFX906, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, + {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32}, }; const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) { @@ -179,22 +180,23 @@ } switch (AK) { - case GK_GFX600: return {6, 0, 0}; - case GK_GFX601: return {6, 0, 1}; - case GK_GFX700: return {7, 0, 0}; - case GK_GFX701: return {7, 0, 1}; - case GK_GFX702: return {7, 0, 2}; - case GK_GFX703: return {7, 0, 3}; - case GK_GFX704: return {7, 0, 4}; - case GK_GFX801: return {8, 0, 1}; - case GK_GFX802: return {8, 0, 2}; - case GK_GFX803: return {8, 0, 3}; - case GK_GFX810: return {8, 1, 0}; - case GK_GFX900: return {9, 0, 0}; - case GK_GFX902: return {9, 0, 2}; - case GK_GFX904: return {9, 0, 4}; - case 
GK_GFX906: return {9, 0, 6}; - case GK_GFX909: return {9, 0, 9}; - default: return {0, 0, 0}; + case GK_GFX600: return {6, 0, 0}; + case GK_GFX601: return {6, 0, 1}; + case GK_GFX700: return {7, 0, 0}; + case GK_GFX701: return {7, 0, 1}; + case GK_GFX702: return {7, 0, 2}; + case GK_GFX703: return {7, 0, 3}; + case GK_GFX704: return {7, 0, 4}; + case GK_GFX801: return {8, 0, 1}; + case GK_GFX802: return {8, 0, 2}; + case GK_GFX803: return {8, 0, 3}; + case GK_GFX810: return {8, 1, 0}; + case GK_GFX900: return {9, 0, 0}; + case GK_GFX902: return {9, 0, 2}; + case GK_GFX904: return {9, 0, 4}; + case GK_GFX906: return {9, 0, 6}; + case GK_GFX909: return {9, 0, 9}; + case GK_GFX1010: return {10, 1, 0}; + default: return {0, 0, 0}; } } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td @@ -60,6 +60,12 @@ "Have scratch_* flat memory instructions" >; +def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", + "ScalarFlatScratchInsts", + "true", + "Have s_scratch_* flat memory instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -115,12 +121,72 @@ "Enable XNACK support" >; +def FeatureCuMode : SubtargetFeature<"cumode", + "EnableCuMode", + "true", + "Enable CU wavefront execution mode" +>; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", "VI SGPR initialization bug requiring a fixed SGPR allocation size" >; +def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug", + "LDSMisalignedBug", + "true", + "Some GFX10 bug with misaligned multi-dword LDS access in WGP mode" +>; + +def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard", + "HasVcmpxPermlaneHazard", + "true", + "TODO: describe me" +>; + +def FeatureVMEMtoScalarWriteHazard : 
SubtargetFeature<"vmem-to-scalar-write-hazard", + "HasVMEMtoScalarWriteHazard", + "true", + "VMEM instruction followed by scalar writing to EXEC mask, M0 or SGPR leads to incorrect execution." +>; + +def FeatureSMEMtoVectorWriteHazard : SubtargetFeature<"smem-to-vector-write-hazard", + "HasSMEMtoVectorWriteHazard", + "true", + "s_load_dword followed by v_cmp page faults" +>; + +def FeatureInstFwdPrefetchBug : SubtargetFeature<"inst-fwd-prefetch-bug", + "HasInstFwdPrefetchBug", + "true", + "S_INST_PREFETCH instruction causes shader to hang" +>; + +def FeatureVcmpxExecWARHazard : SubtargetFeature<"vcmpx-exec-war-hazard", + "HasVcmpxExecWARHazard", + "true", + "V_CMPX WAR hazard on EXEC (V_CMPX issue ONLY)" +>; + +def FeatureLdsBranchVmemWARHazard : SubtargetFeature<"lds-branch-vmem-war-hazard", + "HasLdsBranchVmemWARHazard", + "true", + "Switching between LDS and VMEM-tex not waiting VM_VSRC=0" +>; + +def FeatureNSAtoVMEMBug : SubtargetFeature<"nsa-to-vmem-bug", + "HasNSAtoVMEMBug", + "true", + "MIMG-NSA followed by VMEM fail if EXEC_LO or EXEC_HI equals zero" +>; + +def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug", + "HasFlatSegmentOffsetBug", + "true", + "GFX10 bug, inst_offset ignored in flat segment" +>; + class SubtargetFeatureLDSBankCount : SubtargetFeature < "ldsbankcount"#Value, "LDSBankCount", @@ -155,6 +221,12 @@ "Additional instructions for GFX9+" >; +def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", + "GFX10Insts", + "true", + "Additional instructions for GFX10+" +>; + def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", "GFX7GFX8GFX9Insts", "true", @@ -257,6 +329,12 @@ "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9" >; +def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding", + "HasNSAEncoding", + "true", + "Support NSA encoding for image instructions" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -299,6 +377,36 @@ "Enable SRAM ECC" >; 
+def FeatureNoSdstCMPX : SubtargetFeature<"no-sdst-cmpx", + "HasNoSdstCMPX", + "true", + "V_CMPX does not write VCC/SGPR in addition to EXEC" +>; + +def FeatureVscnt : SubtargetFeature<"vscnt", + "HasVscnt", + "true", + "Has separate store vscnt counter" +>; + +def FeatureRegisterBanking : SubtargetFeature<"register-banking", + "HasRegisterBanking", + "true", + "Has register banking" +>; + +def FeatureVOP3Literal : SubtargetFeature<"vop3-literal", + "HasVOP3Literal", + "true", + "Can use one literal in VOP3" +>; + +def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard", + "HasNoDataDepHazard", + "true", + "Does not need SW waitstates" +>; + //===------------------------------------------------------------===// // Subtarget Features (options and debugging) //===------------------------------------------------------------===// @@ -487,7 +595,24 @@ FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, - FeatureScalarAtomics, FeatureR128A16 + FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16 + ] +>; + +def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", + "gfx10", + [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, + FeatureFlatAddressSpace, + FeatureCIInsts, Feature16BitInsts, + FeatureSMemRealTime, FeatureInv2PiInlineImm, + FeatureApertureRegs, FeatureGFX9Insts, FeatureGFX10Insts, FeatureVOP3P, + FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp, + FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, + FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts, + FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking, + FeatureVOP3Literal, FeatureNoDataDepHazard, + FeatureDoesNotSupportSRAMECC ] >; @@ -601,6 +726,34 @@ FeatureXNACK, FeatureCodeObjectV3]>; +// TODO: 
Organize more features into groups. +def FeatureGroup { + // Bugs present on gfx10.1. + list<SubtargetFeature> GFX10_1_Bugs = [ + FeatureVcmpxPermlaneHazard, + FeatureVMEMtoScalarWriteHazard, + FeatureSMEMtoVectorWriteHazard, + FeatureInstFwdPrefetchBug, + FeatureVcmpxExecWARHazard, + FeatureLdsBranchVmemWARHazard, + FeatureNSAtoVMEMBug, + FeatureFlatSegmentOffsetBug + ]; +} + +def FeatureISAVersion10_1_0 : FeatureSet< + !listconcat(FeatureGroup.GFX10_1_Bugs, + [FeatureGFX10, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureNSAEncoding, + FeatureWavefrontSize64, + FeatureScalarStores, + FeatureScalarAtomics, + FeatureScalarFlatScratchInsts, + FeatureLdsMisalignedBug, + FeatureCodeObjectV3])>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -687,10 +840,21 @@ def isGFX6GFX7 : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">; + +def isGFX6GFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<"!FeatureGCN3Encoding">; def isGFX7Only : Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">; + +def isGFX7GFX10 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">, AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">; def isGFX7GFX8GFX9 : @@ -699,6 +863,13 @@ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">; +def isGFX6GFX7GFX8GFX9 : + Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||" + "Subtarget->getGeneration() == 
AMDGPUSubtarget::SEA_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||" + "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, + AssemblerPredicate<"!FeatureGFX10Insts">; + def isGFX7Plus : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">, AssemblerPredicate<"FeatureCIInsts">; @@ -724,6 +895,10 @@ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">, AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">; +def isGFX10Plus : + Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">, + AssemblerPredicate<"FeatureGFX10Insts">; + def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">, AssemblerPredicate<"FeatureFlatAddressSpace">; @@ -731,6 +906,8 @@ AssemblerPredicate<"FeatureFlatGlobalInsts">; def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">, AssemblerPredicate<"FeatureFlatScratchInsts">; +def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">, + AssemblerPredicate<"FeatureScalarFlatScratchInsts">; def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<"FeatureGFX9Insts">; @@ -766,6 +943,10 @@ Predicate<"Subtarget->hasSDWA()">, AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">; +def HasSDWA10 : + Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">; + def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">; @@ -778,9 +959,18 @@ def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">, AssemblerPredicate<"FeatureMadMixInsts">; +def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">, + AssemblerPredicate<"FeatureScalarStores">; + def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">, AssemblerPredicate<"FeatureScalarAtomics">; +def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"FeatureNoSdstCMPX">; + +def HasSdstCMPX : 
Predicate<"!Subtarget->hasNoSdstCMPX()">, + AssemblerPredicate<"!FeatureNoSdstCMPX">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -55,7 +55,8 @@ SOUTHERN_ISLANDS = 4, SEA_ISLANDS = 5, VOLCANIC_ISLANDS = 6, - GFX9 = 7 + GFX9 = 7, + GFX10 = 8 }; private: @@ -293,6 +294,7 @@ bool UnalignedBufferAccess; bool HasApertureRegs; bool EnableXNACK; + bool EnableCuMode; bool TrapHandler; // Used as options. @@ -313,6 +315,7 @@ bool CIInsts; bool GFX8Insts; bool GFX9Insts; + bool GFX10Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; @@ -329,24 +332,41 @@ bool HasSDWAOutModsVOPC; bool HasDPP; bool HasR128A16; + bool HasNSAEncoding; bool HasDLInsts; bool HasDot1Insts; bool HasDot2Insts; bool EnableSRAMECC; bool DoesNotSupportSRAMECC; + bool HasNoSdstCMPX; + bool HasVscnt; + bool HasRegisterBanking; + bool HasVOP3Literal; + bool HasNoDataDepHazard; bool FlatAddressSpace; bool FlatInstOffsets; bool FlatGlobalInsts; bool FlatScratchInsts; + bool ScalarFlatScratchInsts; bool AddNoCarryInsts; bool HasUnpackedD16VMem; bool R600ALUInst; bool CaymanISA; bool CFALUBug; + bool LDSMisalignedBug; bool HasVertexCache; short TexVTXClauseSize; bool ScalarizeGlobal; + bool HasVcmpxPermlaneHazard; + bool HasVMEMtoScalarWriteHazard; + bool HasSMEMtoVectorWriteHazard; + bool HasInstFwdPrefetchBug; + bool HasVcmpxExecWARHazard; + bool HasLdsBranchVmemWARHazard; + bool HasNSAtoVMEMBug; + bool HasFlatSegmentOffsetBug; + // Dummy feature to use for assembler in tablegen. 
bool FeatureDisable; @@ -583,6 +603,10 @@ return EnableXNACK; } + bool isCuModeEnabled() const { + return EnableCuMode; + } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } @@ -599,6 +623,14 @@ return FlatScratchInsts; } + bool hasScalarFlatScratchInsts() const { + return ScalarFlatScratchInsts; + } + + bool hasFlatSegmentOffsetBug() const { + return HasFlatSegmentOffsetBug; + } + bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; } @@ -654,10 +686,6 @@ return HasSDWAOutModsVOPC; } - bool vmemWriteNeedsExpWaitcnt() const { - return getGeneration() < SEA_ISLANDS; - } - bool hasDLInsts() const { return HasDLInsts; } @@ -674,6 +702,30 @@ return EnableSRAMECC; } + bool hasNoSdstCMPX() const { + return HasNoSdstCMPX; + } + + bool hasVscnt() const { + return HasVscnt; + } + + bool hasRegisterBanking() const { + return HasRegisterBanking; + } + + bool hasVOP3Literal() const { + return HasVOP3Literal; + } + + bool hasNoDataDepHazard() const { + return HasNoDataDepHazard; + } + + bool vmemWriteNeedsExpWaitcnt() const { + return getGeneration() < SEA_ISLANDS; + } + // Scratch is allocated in 256 dword per wave blocks for the entire // wavefront. When viewed from the perspective of an arbitrary workitem, this // is 4-byte aligned. 
@@ -782,6 +834,12 @@ return HasR128A16; } + bool hasNSAEncoding() const { + return HasNSAEncoding; + } + + bool hasMadF16() const; + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -816,6 +874,38 @@ getGeneration() <= AMDGPUSubtarget::GFX9; } + bool hasVcmpxPermlaneHazard() const { + return HasVcmpxPermlaneHazard; + } + + bool hasVMEMtoScalarWriteHazard() const { + return HasVMEMtoScalarWriteHazard; + } + + bool hasSMEMtoVectorWriteHazard() const { + return HasSMEMtoVectorWriteHazard; + } + + bool hasLDSMisalignedBug() const { + return LDSMisalignedBug && !EnableCuMode; + } + + bool hasInstFwdPrefetchBug() const { + return HasInstFwdPrefetchBug; + } + + bool hasVcmpxExecWARHazard() const { + return HasVcmpxExecWARHazard; + } + + bool hasLdsBranchVmemWARHazard() const { + return HasLdsBranchVmemWARHazard; + } + + bool hasNSAtoVMEMBug() const { + return HasNSAtoVMEMBug; + } + /// Return the maximum number of waves per SIMD for kernels using \p SGPRs /// SGPRs unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -181,6 +181,7 @@ HasApertureRegs(false), EnableXNACK(false), + EnableCuMode(false), TrapHandler(false), EnableHugePrivateBuffer(false), @@ -196,6 +197,7 @@ CIInsts(false), GFX8Insts(false), GFX9Insts(false), + GFX10Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -212,20 +214,37 @@ HasSDWAOutModsVOPC(false), HasDPP(false), HasR128A16(false), + HasNSAEncoding(false), HasDLInsts(false), HasDot1Insts(false), HasDot2Insts(false), EnableSRAMECC(false), DoesNotSupportSRAMECC(false), + HasNoSdstCMPX(false), + HasVscnt(false), + HasRegisterBanking(false), + HasVOP3Literal(false), + HasNoDataDepHazard(false), FlatAddressSpace(false), FlatInstOffsets(false), FlatGlobalInsts(false), 
FlatScratchInsts(false), + ScalarFlatScratchInsts(false), AddNoCarryInsts(false), HasUnpackedD16VMem(false), + LDSMisalignedBug(false), ScalarizeGlobal(false), + HasVcmpxPermlaneHazard(false), + HasVMEMtoScalarWriteHazard(false), + HasSMEMtoVectorWriteHazard(false), + HasInstFwdPrefetchBug(false), + HasVcmpxExecWARHazard(false), + HasLdsBranchVmemWARHazard(false), + HasNSAtoVMEMBug(false), + HasFlatSegmentOffsetBug(false), + FeatureDisable(false), InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), TLInfo(TM, *this), @@ -243,6 +262,8 @@ return getLocalMemorySize(); unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves; } @@ -251,6 +272,8 @@ const Function &F) const { unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second; unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize); + if (!WorkGroupsPerCu) + return 0; unsigned MaxWaves = getMaxWavesPerEU(); unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu; unsigned NumWaves = Limit / (Bytes ? 
Bytes : 1u); @@ -271,7 +294,8 @@ case CallingConv::AMDGPU_CS: case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: - return std::make_pair(getWavefrontSize() * 2, getWavefrontSize() * 4); + return std::make_pair(getWavefrontSize() * 2, + std::max(getWavefrontSize() * 4, 256u)); case CallingConv::AMDGPU_VS: case CallingConv::AMDGPU_LS: case CallingConv::AMDGPU_HS: @@ -496,7 +520,14 @@ Policy.ShouldTrackLaneMasks = true; } +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1; +} + unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 10; + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { if (SGPRs <= 80) return 10; @@ -543,6 +574,9 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. + if (MFI.hasFlatScratchInit()) { if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). 
Index: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -999,6 +999,10 @@ return AMDGPU::isGFX9(getSTI()); } + bool isGFX10() const { + return AMDGPU::isGFX10(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -1407,7 +1411,7 @@ bool AMDGPUOperand::isSDWAOperand(MVT type) const { if (AsmParser->isVI()) return isVReg32(); - else if (AsmParser->isGFX9()) + else if (AsmParser->isGFX9() || AsmParser->isGFX10()) return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(type); else return false; @@ -2953,7 +2957,7 @@ if (getParser().parseIdentifier(KernelName)) return true; - kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(); + kernel_descriptor_t KD = getDefaultAmdhsaKernelDescriptor(&getSTI()); StringSet<> Seen; Index: llvm/trunk/lib/Target/AMDGPU/GCNProcessors.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/GCNProcessors.td +++ llvm/trunk/lib/Target/AMDGPU/GCNProcessors.td @@ -164,3 +164,10 @@ FeatureISAVersion9_0_9.Features >; +//===----------------------------------------------------------------------===// +// GCN GFX10. 
+//===----------------------------------------------------------------------===// + +def : ProcessorModel<"gfx1010", GFX10SpeedModel, + FeatureISAVersion10_1_0.Features +>; Index: llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -60,39 +60,40 @@ AMDGPU::GPUKind AK; switch (ElfMach) { - case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; - case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; - case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; - case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; - case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; - case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; - case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; - case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; - case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; - case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; - case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; - case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; - case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; - case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; - case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; - case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: 
AK = GK_GFX801; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; - case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; + case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break; + case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break; + case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break; + case ELF::EF_AMDGPU_MACH_R600_RV670: AK = GK_RV670; break; + case ELF::EF_AMDGPU_MACH_R600_RV710: AK = GK_RV710; break; + case ELF::EF_AMDGPU_MACH_R600_RV730: AK = GK_RV730; break; + case ELF::EF_AMDGPU_MACH_R600_RV770: AK = GK_RV770; break; + case ELF::EF_AMDGPU_MACH_R600_CEDAR: AK = GK_CEDAR; break; + case ELF::EF_AMDGPU_MACH_R600_CYPRESS: AK = GK_CYPRESS; break; + case ELF::EF_AMDGPU_MACH_R600_JUNIPER: AK = GK_JUNIPER; break; + case ELF::EF_AMDGPU_MACH_R600_REDWOOD: AK = GK_REDWOOD; break; + case ELF::EF_AMDGPU_MACH_R600_SUMO: AK = GK_SUMO; break; + case ELF::EF_AMDGPU_MACH_R600_BARTS: AK = GK_BARTS; break; + case ELF::EF_AMDGPU_MACH_R600_CAICOS: AK = GK_CAICOS; break; + case ELF::EF_AMDGPU_MACH_R600_CAYMAN: AK = GK_CAYMAN; break; + case ELF::EF_AMDGPU_MACH_R600_TURKS: AK = GK_TURKS; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX600: AK = GK_GFX600; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX601: AK = GK_GFX601; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX700: AK = GK_GFX700; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX701: AK = GK_GFX701; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX702: AK = GK_GFX702; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX703: AK = GK_GFX703; break; + case 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX704: AK = GK_GFX704; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX801: AK = GK_GFX801; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX802: AK = GK_GFX802; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX803: AK = GK_GFX803; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX810: AK = GK_GFX810; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX900: AK = GK_GFX900; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX902: AK = GK_GFX902; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX904: AK = GK_GFX904; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX906: AK = GK_GFX906; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; + case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } StringRef GPUName = getArchNameAMDGCN(AK); @@ -139,6 +140,7 @@ case GK_GFX904: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX904; case GK_GFX906: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX906; case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; + case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } @@ -324,6 +326,17 @@ PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD, compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL); + if (IVersion.Major >= 10) { + PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE); + PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED); + PRINT_FIELD(OS, ".amdhsa_forward_progress", KD, + compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS); + } PRINT_FIELD( OS, ".amdhsa_exception_fp_ieee_invalid_op", KD, compute_pgm_rsrc2, Index: llvm/trunk/lib/Target/AMDGPU/SIDefines.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIDefines.h +++ llvm/trunk/lib/Target/AMDGPU/SIDefines.h @@ -523,6 +523,15 @@ #define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) #define 
G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) #define C_00B848_IEEE_MODE 0xFF7FFFFF +#define S_00B848_WGP_MODE(x) (((x) & 0x1) << 29) +#define G_00B848_WGP_MODE(x) (((x) >> 29) & 0x1) +#define C_00B848_WGP_MODE 0xDFFFFFFF +#define S_00B848_MEM_ORDERED(x) (((x) & 0x1) << 30) +#define G_00B848_MEM_ORDERED(x) (((x) >> 30) & 0x1) +#define C_00B848_MEM_ORDERED 0xBFFFFFFF +#define S_00B848_FWD_PROGRESS(x) (((x) & 0x1) << 31) +#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1) +#define C_00B848_FWD_PROGRESS 0x7FFFFFFF // Helpers for setting FLOAT_MODE @@ -553,6 +562,15 @@ #define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 #define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 +#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21) +#define S_028B54_GS_W32_EN(x) (((x) & 0x1) << 22) +#define S_028B54_VS_W32_EN(x) (((x) & 0x1) << 23) +#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8 +#define S_0286D8_PS_W32_EN(x) (((x) & 0x1) << 15) +#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 +#define S_00B800_CS_W32_EN(x) (((x) & 0x1) << 15) + #define R_SPILLED_SGPRS 0x4 #define R_SPILLED_VGPRS 0x8 } // End namespace llvm Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5591,7 +5591,9 @@ SDWA = 2, SDWA9 = 3, GFX80 = 4, - GFX9 = 5 + GFX9 = 5, + GFX10 = 6, + SDWA10 = 7 }; static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) { @@ -5604,6 +5606,8 @@ case AMDGPUSubtarget::VOLCANIC_ISLANDS: case AMDGPUSubtarget::GFX9: return SIEncodingFamily::VI; + case AMDGPUSubtarget::GFX10: + return SIEncodingFamily::GFX10; } llvm_unreachable("Unknown subtarget generation!"); } Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td +++ 
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td @@ -23,6 +23,8 @@ int SDWA9 = 3; int GFX80 = 4; int GFX9 = 5; + int GFX10 = 6; + int SDWA10 = 7; } //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td +++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.td @@ -112,9 +112,9 @@ } foreach Index = 0-15 in { - def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; - def TTMP#Index#_gfx9 : SIReg<"ttmp"#Index, !add(108, Index)>; - def TTMP#Index : SIReg<"", 0>; + def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>; + def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>; + def TTMP#Index : SIReg<"", 0>; } multiclass FLAT_SCR_LOHI_m ci_e, bits<16> vi_e> { @@ -311,8 +311,8 @@ getSubRegs.ret>; foreach Index = {0, 2, 4, 6, 8, 10, 12, 14} in { - def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; - def TTMP#Index#_TTMP#!add(Index,1)#_gfx9 : TmpRegTuples<"_gfx9", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_vi : TmpRegTuples<"_vi", 2, Index>; + def TTMP#Index#_TTMP#!add(Index,1)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 2, Index>; } foreach Index = {0, 4, 8, 12} in { @@ -321,7 +321,7 @@ _TTMP#!add(Index,3)#_vi : TmpRegTuples<"_vi", 4, Index>; def TTMP#Index#_TTMP#!add(Index,1)# _TTMP#!add(Index,2)# - _TTMP#!add(Index,3)#_gfx9 : TmpRegTuples<"_gfx9", 4, Index>; + _TTMP#!add(Index,3)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 4, Index>; } foreach Index = {0, 4, 8} in { @@ -338,7 +338,7 @@ _TTMP#!add(Index,4)# _TTMP#!add(Index,5)# _TTMP#!add(Index,6)# - _TTMP#!add(Index,7)#_gfx9 : TmpRegTuples<"_gfx9", 8, Index>; + _TTMP#!add(Index,7)#_gfx9_gfx10 : TmpRegTuples<"_gfx9_gfx10", 8, Index>; } def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_vi : @@ -348,12 +348,12 @@ TTMP8_vi, TTMP9_vi, 
TTMP10_vi, TTMP11_vi, TTMP12_vi, TTMP13_vi, TTMP14_vi, TTMP15_vi]>; -def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9 : +def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15_gfx9_gfx10 : TmpRegTuplesBase<0, 16, - [TTMP0_gfx9, TTMP1_gfx9, TTMP2_gfx9, TTMP3_gfx9, - TTMP4_gfx9, TTMP5_gfx9, TTMP6_gfx9, TTMP7_gfx9, - TTMP8_gfx9, TTMP9_gfx9, TTMP10_gfx9, TTMP11_gfx9, - TTMP12_gfx9, TTMP13_gfx9, TTMP14_gfx9, TTMP15_gfx9]>; + [TTMP0_gfx9_gfx10, TTMP1_gfx9_gfx10, TTMP2_gfx9_gfx10, TTMP3_gfx9_gfx10, + TTMP4_gfx9_gfx10, TTMP5_gfx9_gfx10, TTMP6_gfx9_gfx10, TTMP7_gfx9_gfx10, + TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10, + TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>; // VGPR 32-bit registers Index: llvm/trunk/lib/Target/AMDGPU/SISchedule.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SISchedule.td +++ llvm/trunk/lib/Target/AMDGPU/SISchedule.td @@ -37,6 +37,9 @@ // half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Conversion to or from f64 instruction +def WriteDoubleCvt : SchedWrite; + // Half rate 64-bit instructions. def Write64Bit : SchedWrite; @@ -61,6 +64,7 @@ def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; +def GFX10SpeedModel : SISchedMachineModel; // XXX: Are the resource counts correct? 
def HWBranch : ProcResource<1> { @@ -81,6 +85,9 @@ def HWVALU : ProcResource<1> { let BufferSize = 1; } +def HWRC : ProcResource<1> { // Register destination cache + let BufferSize = 1; +} class HWWriteRes resources, int latency> : WriteRes { @@ -124,6 +131,7 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; @@ -136,7 +144,32 @@ def : HWVALUWriteRes; def : HWVALUWriteRes; def : HWVALUWriteRes; +def : HWVALUWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; } // End SchedModel = SIQuarterSpeedModel + +let SchedModel = GFX10SpeedModel in { + +// The latency values are 1 / (operations / cycle). +// Add 1 stall cycle for VGPR read. +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + +def : InstRW<[WriteCopy], (instrs COPY)>; + +} // End SchedModel = GFX10SpeedModel Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -244,7 +244,8 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI); -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(); +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI); bool isGroupSegment(const GlobalValue *GV); bool isGlobalSegment(const GlobalValue *GV); @@ -398,6 +399,7 @@ bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); +bool isGFX10(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
=================================================================== --- llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -435,11 +435,21 @@ Header.kernarg_segment_alignment = 4; Header.group_segment_alignment = 4; Header.private_segment_alignment = 4; + + if (Version.Major >= 10) { + Header.compute_pgm_resource_registers |= + S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) | + S_00B848_MEM_ORDERED(1); + } } -amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor() { +amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor( + const MCSubtargetInfo *STI) { + IsaVersion Version = getIsaVersion(STI->getCPU()); + amdhsa::kernel_descriptor_t KD; memset(&KD, 0, sizeof(KD)); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE); @@ -449,6 +459,13 @@ amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1); AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1); + if (Version.Major >= 10) { + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE, + STI->getFeatureBits().test(FeatureCuMode) ? 
0 : 1); + AMDHSA_BITS_SET(KD.compute_pgm_rsrc1, + amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1); + } return KD; } @@ -679,6 +696,10 @@ return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; } +bool isGFX10(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10]; +} + bool isGCN3Encoding(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } @@ -704,46 +725,46 @@ CASE_CI_VI(FLAT_SCR) \ CASE_CI_VI(FLAT_SCR_LO) \ CASE_CI_VI(FLAT_SCR_HI) \ - CASE_VI_GFX9(TTMP0) \ - CASE_VI_GFX9(TTMP1) \ - CASE_VI_GFX9(TTMP2) \ - CASE_VI_GFX9(TTMP3) \ - CASE_VI_GFX9(TTMP4) \ - CASE_VI_GFX9(TTMP5) \ - CASE_VI_GFX9(TTMP6) \ - CASE_VI_GFX9(TTMP7) \ - CASE_VI_GFX9(TTMP8) \ - CASE_VI_GFX9(TTMP9) \ - CASE_VI_GFX9(TTMP10) \ - CASE_VI_GFX9(TTMP11) \ - CASE_VI_GFX9(TTMP12) \ - CASE_VI_GFX9(TTMP13) \ - CASE_VI_GFX9(TTMP14) \ - CASE_VI_GFX9(TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1) \ - CASE_VI_GFX9(TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5) \ - CASE_VI_GFX9(TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9) \ - CASE_VI_GFX9(TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13) \ - CASE_VI_GFX9(TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ - CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ - CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ - CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0) \ + CASE_VI_GFX9_GFX10(TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2) \ + CASE_VI_GFX9_GFX10(TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4) \ + CASE_VI_GFX9_GFX10(TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6) \ + CASE_VI_GFX9_GFX10(TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8) \ + CASE_VI_GFX9_GFX10(TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10) \ + CASE_VI_GFX9_GFX10(TTMP11) \ + 
CASE_VI_GFX9_GFX10(TTMP12) \ + CASE_VI_GFX9_GFX10(TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14) \ + CASE_VI_GFX9_GFX10(TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1) \ + CASE_VI_GFX9_GFX10(TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5) \ + CASE_VI_GFX9_GFX10(TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9) \ + CASE_VI_GFX9_GFX10(TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13) \ + CASE_VI_GFX9_GFX10(TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \ + CASE_VI_GFX9_GFX10(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \ + CASE_VI_GFX9_GFX10(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ + CASE_VI_GFX9_GFX10(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \ } #define CASE_CI_VI(node) \ assert(!isSI(STI)); \ case node: return isCI(STI) ? node##_ci : node##_vi; -#define CASE_VI_GFX9(node) \ - case node: return isGFX9(STI) ? node##_gfx9 : node##_vi; +#define CASE_VI_GFX9_GFX10(node) \ + case node: return (isGFX9(STI) || isGFX10(STI)) ? 
node##_gfx9_gfx10 : node##_vi; unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { if (STI.getTargetTriple().getArch() == Triple::r600) @@ -752,17 +773,17 @@ } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #define CASE_CI_VI(node) case node##_ci: case node##_vi: return node; -#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node; +#define CASE_VI_GFX9_GFX10(node) case node##_vi: case node##_gfx9_gfx10: return node; unsigned mc2PseudoReg(unsigned Reg) { MAP_REG2REG } #undef CASE_CI_VI -#undef CASE_VI_GFX9 +#undef CASE_VI_GFX9_GFX10 #undef MAP_REG2REG bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { @@ -1030,5 +1051,6 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID) { return lookupSourceOfDivergence(IntrID); } + } // namespace AMDGPU } // namespace llvm Index: llvm/trunk/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h +++ llvm/trunk/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h @@ -82,6 +82,9 @@ COMPPGM1(enable_dx10_clamp, compute_pgm_rsrc1_dx10_clamp, DX10_CLAMP), COMPPGM1(debug_mode, compute_pgm_rsrc1_debug_mode, DEBUG_MODE), COMPPGM1(enable_ieee_mode, compute_pgm_rsrc1_ieee_mode, IEEE_MODE), +COMPPGM1(enable_wgp_mode, compute_pgm_rsrc1_wgp_mode, WGP_MODE), +COMPPGM1(enable_mem_ordered, compute_pgm_rsrc1_mem_ordered, MEM_ORDERED), +COMPPGM1(enable_fwd_progress, compute_pgm_rsrc1_fwd_progress, FWD_PROGRESS), // TODO: bulky // TODO: cdbg_user COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN), Index: llvm/trunk/test/CodeGen/AMDGPU/elf-header-flags-mach.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ llvm/trunk/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -47,6 +47,7 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx904 < %s | llvm-readobj 
-file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX904 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx906 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX906 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx909 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX909 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s ; ARCH-R600: Arch: r600 ; ARCH-GCN: Arch: amdgcn @@ -87,6 +88,7 @@ ; GFX904: EF_AMDGPU_MACH_AMDGCN_GFX904 (0x2E) ; GFX906: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F) ; GFX909: EF_AMDGPU_MACH_AMDGCN_GFX909 (0x31) +; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; ALL: ] define amdgpu_kernel void @elf_header() { Index: llvm/trunk/test/CodeGen/AMDGPU/hsa-note-no-func.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -24,6 +24,7 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx904 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX904 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx906 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX906 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx909 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX909 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1010 %s ; HSA: .hsa_code_object_version 2,1 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU" @@ -42,3 +43,4 @@ ; HSA-GFX904: .hsa_code_object_isa 9,0,4,"AMD","AMDGPU" ; HSA-GFX906: .hsa_code_object_isa 9,0,6,"AMD","AMDGPU" ; HSA-GFX909: .hsa_code_object_isa 9,0,9,"AMD","AMDGPU" +; HSA-GFX1010: .hsa_code_object_isa 10,1,0,"AMD","AMDGPU" Index: llvm/trunk/tools/llvm-readobj/ELFDumper.cpp 
=================================================================== --- llvm/trunk/tools/llvm-readobj/ELFDumper.cpp +++ llvm/trunk/tools/llvm-readobj/ELFDumper.cpp @@ -1275,6 +1275,7 @@ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX904), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX906), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX909), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC) };