diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -74,6 +74,7 @@ GFX1010, GFX1011, GFX1012, + GFX1030, LAST, }; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -170,6 +170,22 @@ // XXX - What does the member GPU mean if device name string passed here? if (isAMDGCN(getTriple())) { switch (llvm::AMDGPU::parseArchAMDGCN(CPU)) { + case GK_GFX1030: + Features["ci-insts"] = true; + Features["dot1-insts"] = true; + Features["dot2-insts"] = true; + Features["dot5-insts"] = true; + Features["dot6-insts"] = true; + Features["dl-insts"] = true; + Features["flat-address-space"] = true; + Features["16-bit-insts"] = true; + Features["dpp"] = true; + Features["gfx8-insts"] = true; + Features["gfx9-insts"] = true; + Features["gfx10-insts"] = true; + Features["gfx10-3-insts"] = true; + Features["s-memrealtime"] = true; + break; case GK_GFX1012: case GK_GFX1011: Features["dot1-insts"] = true; diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -200,6 +200,7 @@ case CudaArch::GFX1010: case CudaArch::GFX1011: case CudaArch::GFX1012: + case CudaArch::GFX1030: case CudaArch::LAST: break; case CudaArch::UNKNOWN: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -5015,6 +5015,7 @@ case CudaArch::GFX1010: case CudaArch::GFX1011: case CudaArch::GFX1012: + case CudaArch::GFX1030: case CudaArch::UNKNOWN: break; case CudaArch::LAST: @@ -5074,6 +5075,7 @@ case CudaArch::GFX1010: case CudaArch::GFX1011: case CudaArch::GFX1012: + case CudaArch::GFX1030: case CudaArch::UNKNOWN: break; case CudaArch::LAST: diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -13,6 +13,7 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1011 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1011 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1012 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1012 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1030 -S -emit-llvm -o - %s | FileCheck --check-prefix=GFX1030 %s // GFX600-NOT: "target-features" // GFX601-NOT: "target-features" @@ -24,5 +25,6 @@ // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" +// GFX1030: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dpp,+flat-address-space,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime" kernel void test() {} diff --git 
a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -180,6 +180,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,GFX1010 %s // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,GFX1011 %s // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,GFX1012 %s +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1030 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,GFX1030 %s // GFX600-DAG: #define FP_FAST_FMA 1 // GFX601-DAG: #define FP_FAST_FMA 1 @@ -201,6 +202,7 @@ // GFX1010-DAG: #define FP_FAST_FMA 1 // GFX1011-DAG: #define FP_FAST_FMA 1 // GFX1012-DAG: #define FP_FAST_FMA 1 +// GFX1030-DAG: #define FP_FAST_FMA 1 // GFX600-DAG: #define FP_FAST_FMAF 1 // GFX601-NOT: #define FP_FAST_FMAF 1 @@ -222,6 +224,7 @@ // GFX1010-DAG: #define FP_FAST_FMAF 1 // GFX1011-DAG: #define FP_FAST_FMAF 1 // GFX1012-DAG: #define FP_FAST_FMAF 1 +// GFX1030-DAG: #define FP_FAST_FMAF 1 // ARCH-GCN-DAG: #define __AMDGCN__ 1 // ARCH-GCN-DAG: #define __AMDGPU__ 1 @@ -247,6 +250,7 @@ // GFX1010-DAG: #define __HAS_FMAF__ 1 // GFX1011-DAG: #define __HAS_FMAF__ 1 // GFX1012-DAG: #define __HAS_FMAF__ 1 +// GFX1030-DAG: #define __HAS_FMAF__ 1 // GFX600-DAG: #define __HAS_FP64__ 1 // GFX601-DAG: #define __HAS_FP64__ 1 @@ -268,6 +272,7 @@ // GFX1010-DAG: #define __HAS_FP64__ 1 // GFX1011-DAG: #define __HAS_FP64__ 1 // GFX1012-DAG: #define __HAS_FP64__ 1 +// GFX1030-DAG: #define __HAS_FP64__ 1 // GFX600-DAG: #define __HAS_LDEXPF__ 1 // GFX601-DAG: #define __HAS_LDEXPF__ 1 @@ -289,6 +294,7 @@ // GFX1010-DAG: #define __HAS_LDEXPF__ 1 // GFX1011-DAG: #define __HAS_LDEXPF__ 1 // GFX1012-DAG: #define __HAS_LDEXPF__ 1 +// GFX1030-DAG: #define __HAS_LDEXPF__ 1 // GFX600-DAG: #define __gfx600__ 1 // GFX601-DAG: #define __gfx601__ 1 @@ -310,3 +316,4 @@ // GFX1010-DAG: #define __gfx1010__ 1 // GFX1011-DAG: #define __gfx1011__ 1 // GFX1012-DAG: #define __gfx1012__ 1 +// GFX1030-DAG: #define __gfx1030__ 1 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -90,6 +90,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1011 %s 2>&1 | FileCheck --check-prefix=GFX1011 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1012 %s 2>&1 | FileCheck --check-prefix=GFX1012 %s +// RUN: %clang -### -target amdgcn -mcpu=gfx1030 %s 2>&1 | FileCheck --check-prefix=GFX1030 %s // GCNDEFAULT-NOT: -target-cpu // GFX600: "-target-cpu" "gfx600" @@ -129,3 +130,4 @@ // GFX1010: "-target-cpu" "gfx1010" // GFX1011: "-target-cpu" "gfx1011" // GFX1012: "-target-cpu" "gfx1012" +// GFX1030: "-target-cpu" "gfx1030" diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -263,6 +263,15 @@ .. TODO:: Add product names. + ``gfx1030`` ``amdgcn`` dGPU - xnack *TBA* + [off] + - wavefrontsize64 + [off] + - cumode + [off] + .. TODO + Add product + names. =========== =============== ============ ===== ================= ======= ====================== .. 
_amdgpu-target-features:
@@ -806,6 +815,7 @@
      ``EF_AMDGPU_MACH_AMDGCN_GFX1010`` 0x033      ``gfx1010``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1011`` 0x034      ``gfx1011``
      ``EF_AMDGPU_MACH_AMDGCN_GFX1012`` 0x035      ``gfx1012``
+     ``EF_AMDGPU_MACH_AMDGCN_GFX1030`` 0x036      ``gfx1030``
      ================================= ========== =============================

 Sections
diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h
--- a/llvm/include/llvm/BinaryFormat/ELF.h
+++ b/llvm/include/llvm/BinaryFormat/ELF.h
@@ -706,6 +706,7 @@
   EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
   EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
   EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,

   // Reserved for AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
@@ -713,7 +714,7 @@
   // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1012,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1030,

   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -765,6 +765,11 @@
                "STORE_MIP", [], [AMDGPUArg], [IntrWriteMem],
                [SDNPMemOperand], 1>;

+  defm int_amdgcn_image_msaa_load
+    : AMDGPUImageDimIntrinsicsAll<"MSAA_LOAD", [llvm_any_ty], [], [IntrReadMem],
+                                  [SDNPMemOperand]>,
+      AMDGPUImageDMaskIntrinsic;
+
   //////////////////////////////////////////////////////////////////////////
   // sample and getlod intrinsics
   //////////////////////////////////////////////////////////////////////////
@@ -1142,6 +1147,7 @@
                                 [ImmArg>], "", [SDNPMemOperand]>,
                        AMDGPURsrcIntrinsic<2, 0>;
+def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
 } // defset AMDGPUBufferIntrinsics

 // Uses that do not set the done bit should set IntrWriteMem on the
@@ -1603,6 +1609,14 @@
   GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
   Intrinsic<[llvm_i32_ty], [], [IntrReadMem, IntrInaccessibleMemOnly]>;

+class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
+  [vt],
+  [llvm_anyptr_ty,    // vaddr
+   vt],               // vdata(VGPR)
+  [IntrArgMemOnly, NoCapture<ArgIndex<0>>], "", [SDNPMemOperand]>;
+
+def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/Support/TargetParser.h b/llvm/include/llvm/Support/TargetParser.h
--- a/llvm/include/llvm/Support/TargetParser.h
+++ b/llvm/include/llvm/Support/TargetParser.h
@@ -84,9 +84,10 @@
   GK_GFX1010 = 71,
   GK_GFX1011 = 72,
   GK_GFX1012 = 73,
+  GK_GFX1030 = 75,

   GK_AMDGCN_FIRST = GK_GFX600,
-  GK_AMDGCN_LAST = GK_GFX1012,
+  GK_AMDGCN_LAST = GK_GFX1030,
 };

 /// Instruction set architecture version.
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -429,6 +429,7 @@ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1030, EF_AMDGPU_MACH); BCase(EF_AMDGPU_XNACK); BCase(EF_AMDGPU_SRAM_ECC); break; diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp --- a/llvm/lib/Support/TargetParser.cpp +++ b/llvm/lib/Support/TargetParser.cpp @@ -62,7 +62,7 @@ // This table should be sorted by the value of GPUKind // Don't bother listing the implicitly true features -constexpr GPUInfo AMDGCNGPUs[37] = { +constexpr GPUInfo AMDGCNGPUs[38] = { // Name Canonical Kind Features // Name {{"gfx600"}, {"gfx600"}, GK_GFX600, FEATURE_FAST_FMA_F32}, @@ -102,6 +102,7 @@ {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, + {{"gfx1030"}, {"gfx1030"}, GK_GFX1030, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32}, }; const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef Table) { @@ -203,6 +204,7 @@ case GK_GFX1010: return {10, 1, 0}; case GK_GFX1011: return {10, 1, 1}; case GK_GFX1012: return {10, 1, 2}; + case GK_GFX1030: return {10, 3, 0}; default: return {0, 0, 0}; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -260,6 +260,12 @@ "Additional instructions for GFX10+" >; +def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts", + "GFX10_3Insts", + "true", + "Additional instructions for GFX10.3" +>; + def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts", "GFX7GFX8GFX9Insts", "true", @@ -387,6 +393,12 @@ "Support NSA encoding for image instructions" >; +def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding", + "GFX10_BEncoding", + "true", + "Encoding format GFX10_B" +>; + def FeatureIntClamp : SubtargetFeature<"int-clamp-insts", "HasIntClamp", "true", @@ -485,6 +497,30 @@ "Has separate store vscnt counter" >; +def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst", + "HasGetWaveIdInst", + "true", + "Has s_get_waveid_in_workgroup instruction" +>; + +def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst", + "HasSMemTimeInst", + "true", + "Has s_memtime instruction" +>; + +def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts", + "HasMadMacF32Insts", + "true", + "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions" +>; + +def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts", + "HasDsSrc2Insts", + "true", + "Has ds_*_src2 instructions" +>; + def FeatureRegisterBanking : SubtargetFeature<"register-banking", "HasRegisterBanking", "true", @@ -617,9 +653,10 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS", "southern-islands", [FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128, - FeatureWavefrontSize64, - FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange, - FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK] + FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLDSBankCount32, 
FeatureMovrel, + FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, + FeatureDoesNotSupportXNACK] >; def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS", @@ -627,7 +664,8 @@ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128, FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange, - FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC] + FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC] >; def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS", @@ -638,8 +676,9 @@ FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP, - FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC, - FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureFastDenormalF32 + FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts, + FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32 ] >; @@ -655,7 +694,9 @@ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts, FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, - FeatureFastDenormalF32] + FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts, + FeatureFastDenormalF32 + ] >; def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", @@ -843,6 +884,10 @@ FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; @@ -861,6 +906,10 @@ FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; @@ -878,10 +927,29 @@ FeatureScalarStores, FeatureScalarAtomics, FeatureScalarFlatScratchInsts, + FeatureGetWaveIdInst, + FeatureSMemTimeInst, + FeatureMadMacF32Insts, + FeatureDsSrc2Insts, FeatureLdsMisalignedBug, FeatureDoesNotSupportXNACK, FeatureCodeObjectV3])>; +def FeatureISAVersion10_3_0 : FeatureSet< + [FeatureGFX10, + FeatureGFX10_BEncoding, + FeatureGFX10_3Insts, + FeatureLDSBankCount32, + FeatureDLInsts, + FeatureDot1Insts, + FeatureDot2Insts, + FeatureDot5Insts, + FeatureDot6Insts, + FeatureNSAEncoding, + FeatureWavefrontSize32, + FeatureDoesNotSupportXNACK, + FeatureCodeObjectV3]>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { @@ -1039,6 +1107,9 @@ def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; +def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">, + AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>; + def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">, AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>; def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">, @@ -1148,15 +1219,32 @@ def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">, AssemblerPredicate<(all_of FeatureDot6Insts)>; +def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">, + AssemblerPredicate<(all_of FeatureGetWaveIdInst)>; + def HasMAIInsts : 
Predicate<"Subtarget->hasMAIInsts()">, AssemblerPredicate<(all_of FeatureMAIInsts)>; +def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">, + AssemblerPredicate<(all_of FeatureSMemTimeInst)>; + +def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">; + def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">, AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>; +def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">, + AssemblerPredicate<(all_of FeatureMadMacF32Insts)>; + def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; +def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">, + AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>; + +def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, + AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; + def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">, AssemblerPredicate<(all_of FeatureOffset3fBug)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -923,7 +923,10 @@ Value *FQNeg = Builder.CreateFNeg(FQ); // float fr = mad(fqneg, fb, fa); - Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, + auto FMAD = !ST->hasMadMacF32Insts() + ? Intrinsic::fma + : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz; + Value *FR = Builder.CreateIntrinsic(FMAD, {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); // int iq = (int)fq; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -716,7 +716,8 @@ (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC || Opc == ISD::ATOMIC_LOAD_FADD || Opc == AMDGPUISD::ATOMIC_LOAD_FMIN || - Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) { + Opc == AMDGPUISD::ATOMIC_LOAD_FMAX || + Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) { N = glueCopyToM0LDSInit(N); SelectCode(N); return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -509,6 +509,7 @@ ATOMIC_DEC, ATOMIC_LOAD_FMIN, ATOMIC_LOAD_FMAX, + ATOMIC_LOAD_CSUB, BUFFER_LOAD, BUFFER_LOAD_UBYTE, BUFFER_LOAD_USHORT, @@ -535,6 +536,7 @@ BUFFER_ATOMIC_INC, BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, + BUFFER_ATOMIC_CSUB, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, ATOMIC_PK_FADD, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1699,10 +1699,11 @@ const AMDGPUMachineFunction *MFI = MF.getInfo(); // float fr = mad(fqneg, fb, fa); - unsigned OpCode = !MFI->getMode().allFP32Denormals() ? + unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? + (unsigned)ISD::FMA : + !MFI->getMode().allFP32Denormals() ? (unsigned)ISD::FMAD : (unsigned)AMDGPUISD::FMAD_FTZ; - SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; @@ -1785,11 +1786,12 @@ const SIMachineFunctionInfo *MFI = MF.getInfo(); // Compute denominator reciprocal. - unsigned FMAD = !MFI->getMode().allFP32Denormals() ? + unsigned FMAD = !Subtarget->hasMadMacF32Insts() ? + (unsigned)ISD::FMA : + !MFI->getMode().allFP32Denormals() ? 
(unsigned)ISD::FMAD : (unsigned)AMDGPUISD::FMAD_FTZ; - SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi, @@ -4394,6 +4396,7 @@ NODE_NAME_CASE(ATOMIC_DEC) NODE_NAME_CASE(ATOMIC_LOAD_FMIN) NODE_NAME_CASE(ATOMIC_LOAD_FMAX) + NODE_NAME_CASE(ATOMIC_LOAD_CSUB) NODE_NAME_CASE(BUFFER_LOAD) NODE_NAME_CASE(BUFFER_LOAD_UBYTE) NODE_NAME_CASE(BUFFER_LOAD_USHORT) @@ -4420,6 +4423,7 @@ NODE_NAME_CASE(BUFFER_ATOMIC_INC) NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) + NODE_NAME_CASE(BUFFER_ATOMIC_CSUB) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) NODE_NAME_CASE(ATOMIC_PK_FADD) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -198,6 +198,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -238,6 +239,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -67,6 +67,8 @@ protected: bool Has16BitInsts; bool HasMadMixInsts; + bool HasMadMacF32Insts; + bool HasDsSrc2Insts; bool HasSDWA; bool HasVOP3PInsts; bool HasMulI24; @@ -140,6 +142,10 @@ return isAmdHsaOS() || isMesaKernel(F); } + bool isGCN() const { + return TargetTriple.getArch() == Triple::amdgcn; + } + bool has16BitInsts() const { return Has16BitInsts; } @@ -148,6 +154,14 @@ return HasMadMixInsts; } + bool hasMadMacF32Insts() const { + return HasMadMacF32Insts || !isGCN(); + } + + bool hasDsSrc2Insts() const { + return HasDsSrc2Insts; + } + bool hasSDWA() const { return HasSDWA; } @@ -325,6 +339,7 @@ bool GFX8Insts; bool GFX9Insts; bool GFX10Insts; + bool GFX10_3Insts; bool GFX7GFX8GFX9Insts; bool SGPRInitBug; bool HasSMemRealTime; @@ -345,6 +360,7 @@ bool HasGFX10A16; bool HasG16; bool HasNSAEncoding; + bool GFX10_BEncoding; bool HasDLInsts; bool HasDot1Insts; bool HasDot2Insts; @@ -359,6 +375,8 @@ bool DoesNotSupportSRAMECC; bool HasNoSdstCMPX; bool HasVscnt; + bool HasGetWaveIdInst; + bool HasSMemTimeInst; bool HasRegisterBanking; bool HasVOP3Literal; bool HasNoDataDepHazard; @@ -721,6 +739,14 @@ return ScalarFlatScratchInsts; } + bool hasGlobalAddTidInsts() const { + return GFX10_BEncoding; + } + + bool hasAtomicCSub() const { + return GFX10_BEncoding; + } + bool hasMultiDwordFlatScratchAddressing() const { return getGeneration() >= GFX9; } @@ -854,6 +880,14 @@ return HasVscnt; } + bool hasGetWaveIdInst() const { + return HasGetWaveIdInst; + } + + bool hasSMemTimeInst() const { + return HasSMemTimeInst; + } + bool hasRegisterBanking() const { return HasRegisterBanking; } @@ -972,6 +1006,14 @@ return HasNSAEncoding; } + bool hasGFX10_BEncoding() const { + return GFX10_BEncoding; + } + + bool hasGFX10_3Insts() const { + return GFX10_3Insts; + } + bool hasMadF16() const; bool enableSIScheduler() const { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp --- 
a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -153,6 +153,8 @@ TargetTriple(TT), Has16BitInsts(false), HasMadMixInsts(false), + HasMadMacF32Insts(false), + HasDsSrc2Insts(false), HasSDWA(false), HasVOP3PInsts(false), HasMulI24(true), @@ -205,6 +207,7 @@ GFX8Insts(false), GFX9Insts(false), GFX10Insts(false), + GFX10_3Insts(false), GFX7GFX8GFX9Insts(false), SGPRInitBug(false), HasSMemRealTime(false), @@ -225,6 +228,7 @@ HasGFX10A16(false), HasG16(false), HasNSAEncoding(false), + GFX10_BEncoding(false), HasDLInsts(false), HasDot1Insts(false), HasDot2Insts(false), @@ -239,6 +243,8 @@ DoesNotSupportSRAMECC(false), HasNoSdstCMPX(false), HasVscnt(false), + HasGetWaveIdInst(false), + HasSMemTimeInst(false), HasRegisterBanking(false), HasVOP3Literal(false), HasNoDataDepHazard(false), diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1188,6 +1188,10 @@ return AMDGPU::isGFX10(getSTI()); } + bool isGFX10_BEncoding() const { + return AMDGPU::isGFX10_BEncoding(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1003,6 +1003,11 @@ "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64 >; +let SubtargetPredicate = HasGFX10_BEncoding in +defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN < + "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32 +>; + let SubtargetPredicate = isGFX8GFX9 in { def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">; } @@ -1372,6 +1377,7 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; @@ -1879,8 +1885,7 @@ def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10(NAME#"_LDS_BOTHEN")>, MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">; } - multiclass MUBUF_Real_Atomics_gfx10 op> : - MUBUF_Real_AllAddr_gfx10 { + multiclass MUBUF_Real_Atomics_RTN_gfx10 op> { def _BOTHEN_RTN_gfx10 : MUBUF_Real_gfx10(NAME#"_BOTHEN_RTN")>; def _IDXEN_RTN_gfx10 : @@ -1890,6 +1895,8 @@ def _OFFSET_RTN_gfx10 : MUBUF_Real_gfx10(NAME#"_OFFSET_RTN")>; } + multiclass MUBUF_Real_Atomics_gfx10 op> : + MUBUF_Real_AllAddr_gfx10, MUBUF_Real_Atomics_RTN_gfx10; } // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>; @@ -2054,6 +2061,8 @@ defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>; defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>; +defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>; + defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>; defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>; def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -505,6 +505,7 @@ def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">; } +let SubtargetPredicate = HasDsSrc2Insts in { def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">; def 
DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">; def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">; @@ -537,6 +538,7 @@ def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">; def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">; +} // End SubtargetPredicate = HasDsSrc2Insts let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in { def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>; @@ -619,7 +621,7 @@ } // let SubtargetPredicate = isGFX8Plus -let SubtargetPredicate = HasLDSFPAtomics in { +let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in { def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -296,6 +296,18 @@ if (Bytes.size() >= 8) { const uint64_t QW = eatBytes(Bytes); + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { + Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address); + if (Res) { + if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8) + == -1) + break; + if (convertDPP8Inst(MI) == MCDisassembler::Success) + break; + MI = MCInst(); // clear + } + } + Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address); if (Res && convertDPP8Inst(MI) == MCDisassembler::Success) break; @@ -345,6 +357,11 @@ Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address); if (Res) break; + if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) { + Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address); + if (Res) break; + } + Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address); if (Res) break; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -183,6 +183,25 @@ } } +class FLAT_Global_Load_AddTid_Pseudo : FLAT_Pseudo< + opName, + (outs regClass:$vdst), + !con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc), + !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))), + " $vdst, $saddr$offset$glc$slc$dlc"> { + let is_flat_global = 1; + let has_data = 0; + let mayLoad = 1; + let has_vaddr = 0; + let has_saddr = 1; + let enabled_saddr = 1; + let maybeAtomic = 1; + + let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", ""); + let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); +} + multiclass FLAT_Global_Store_Pseudo { let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Store_Pseudo, @@ -192,6 +211,24 @@ } } +class FLAT_Global_Store_AddTid_Pseudo : FLAT_Pseudo< + opName, + (outs), + !con( + (ins vdataClass:$vdata, SReg_64:$saddr), + (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)), + " $vdata, $saddr$offset$glc$slc$dlc"> { + let is_flat_global = 1; + let mayLoad = 0; + let mayStore = 1; + let has_vdst = 0; + let has_vaddr = 0; + let has_saddr = 1; + let enabled_saddr = 1; + let maybeAtomic = 1; +} + class FLAT_Scratch_Load_Pseudo : FLAT_Pseudo< opName, @@ -526,6 +563,8 @@ defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>; +let OtherPredicates = [HasGFX10_BEncoding] in +def GLOBAL_LOAD_DWORD_ADDTID : 
FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>; defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>; defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>; @@ -533,6 +572,8 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>; defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>; defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>; +let OtherPredicates = [HasGFX10_BEncoding] in +def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>; defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>; defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>; @@ -618,6 +659,10 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64>; + +let SubtargetPredicate = HasGFX10_BEncoding in +defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub", + VGPR_32, i32, atomic_csub_global_32>; } // End is_flat_global = 1 @@ -914,6 +959,7 @@ def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; @@ -1214,6 +1260,9 @@ FLAT_Real_RTN_gfx10, FLAT_Real_SADDR_RTN_gfx10; +multiclass FLAT_Real_GlblAtomics_RTN_gfx10 op> : + FLAT_Real_RTN_gfx10, + FLAT_Real_SADDR_RTN_gfx10; // ENC_FLAT. defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>; @@ -1299,6 +1348,7 @@ defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>; defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>; defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>; +defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>; defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>; defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>; defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>; @@ -1327,7 +1377,8 @@ defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>; defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>; defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>; - +defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>; +defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>; // ENC_FLAT_SCRATCH. 
defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>; diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -183,3 +183,7 @@ def : ProcessorModel<"gfx1012", GFX10SpeedModel, FeatureISAVersion10_1_2.Features >; + +def : ProcessorModel<"gfx1030", GFX10SpeedModel, + FeatureISAVersion10_3_0.Features +>; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -97,6 +97,7 @@ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break; case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break; } @@ -148,6 +149,7 @@ case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011; case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012; + case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030; case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE; } diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -829,6 +829,9 @@ //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>; //def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>; +let SubtargetPredicate = HasGFX10_BEncoding in +defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>; + /********** ========================================= **********/ /********** Table of dimension-aware image intrinsics **********/ /********** ========================================= **********/ diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -333,7 +333,9 @@ ID_FLAT_SCR_HI = 21, ID_XNACK_MASK = 22, ID_POPS_PACKER = 25, - ID_SYMBOLIC_LAST_ = 26, + ID_SHADER_CYCLES = 29, + ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES, + ID_SYMBOLIC_LAST_ = 30, ID_SHIFT_ = 0, ID_WIDTH_ = 6, ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -453,10 +453,8 @@ setOperationAction(ISD::FLOG10, MVT::f16, Custom); } - // v_mad_f32 does not support denormals. We report it as unconditionally - // legal, and the context where it is formed will disallow it when fp32 - // denormals are enabled. - setOperationAction(ISD::FMAD, MVT::f32, Legal); + if (Subtarget->hasMadMacF32Insts()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. 
@@ -1130,6 +1128,17 @@
     return true;
   }
+  case Intrinsic::amdgcn_global_atomic_csub: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align.reset();
+    Info.flags = MachineMemOperand::MOLoad |
+                 MachineMemOperand::MOStore |
+                 MachineMemOperand::MODereferenceable |
+                 MachineMemOperand::MOVolatile;
+    return true;
+  }
   case Intrinsic::amdgcn_ds_gws_init:
   case Intrinsic::amdgcn_ds_gws_barrier:
   case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -4283,7 +4292,8 @@
   // v_mad_f32/v_mac_f32 do not support denormals.
   EVT VT = N->getValueType(0);
   if (VT == MVT::f32)
-    return !hasFP32Denormals(DAG.getMachineFunction());
+    return Subtarget->hasMadMacF32Insts() &&
+           !hasFP32Denormals(DAG.getMachineFunction());
   if (VT == MVT::f16) {
     return Subtarget->hasMadF16() &&
            !hasFP64FP16Denormals(DAG.getMachineFunction());
@@ -6859,6 +6869,7 @@
   case Intrinsic::amdgcn_buffer_atomic_swap:
   case Intrinsic::amdgcn_buffer_atomic_add:
   case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_buffer_atomic_csub:
   case Intrinsic::amdgcn_buffer_atomic_smin:
   case Intrinsic::amdgcn_buffer_atomic_umin:
   case Intrinsic::amdgcn_buffer_atomic_smax:
@@ -6901,6 +6912,9 @@
     case Intrinsic::amdgcn_buffer_atomic_sub:
       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
       break;
+    case Intrinsic::amdgcn_buffer_atomic_csub:
+      Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
+      break;
     case Intrinsic::amdgcn_buffer_atomic_smin:
       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
       break;
@@ -7149,6 +7163,18 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
+  case Intrinsic::amdgcn_global_atomic_csub: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    SDValue Ops[] = {
+      M->getOperand(0), // Chain
+      M->getOperand(2), // Ptr
+      M->getOperand(3)  // Value
+    };
+
+    return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
+                                   M->getVTList(), Ops, M->getMemoryVT(),
+                                   M->getMemOperand());
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -28,6 +28,7 @@
   int GFX9 = 5;
   int GFX10 = 6;
   int SDWA10 = 7;
+  int GFX10_B = 8;
 }

 //===----------------------------------------------------------------------===//
@@ -54,6 +55,10 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;

+def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
   SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
 ]>;
@@ -197,6 +202,7 @@
 def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
 def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
 def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
+def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
 def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
 def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;

@@ -305,6 +311,10 @@
 // PatFrags for global memory operations
 //===----------------------------------------------------------------------===//

+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
+defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
+}
+
 foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
 let AddressSpaces =
!cast("LoadAddress_"#as).AddrSpaces in { @@ -658,6 +668,7 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">; defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">; +defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>; defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>; defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; @@ -1374,6 +1385,7 @@ int FLAT_SCR_HI = 21; int XNACK_MASK = 22; int POPS_PACKER = 25; + int SHADER_CYCLES = 29; } class getHwRegImm { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -916,6 +916,7 @@ $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) >; +let SubtargetPredicate = HasMadMacF32Insts in def : FMADModsPat; def : FMADModsPat { let SubtargetPredicate = Has16BitInsts; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -220,7 +220,7 @@ // Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding. void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) { const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); - if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) return; MachineFunction *MF = MI.getParent()->getParent(); diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -319,6 +319,7 @@ } } // End SubtargetPredicate = HasScalarStores +let SubtargetPredicate = HasSMemTimeInst in def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>; def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>; @@ -339,10 +340,11 @@ } } // SubtargetPredicate = isGFX8Plus -let SubtargetPredicate = isGFX10Plus in { +let SubtargetPredicate = isGFX10Plus in def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">; +let SubtargetPredicate = HasGetWaveIdInst in def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>; -} // End SubtargetPredicate = isGFX10Plus + let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in { defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>; @@ -847,10 +849,21 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>; } // End let AddedComplexity = 100 +let OtherPredicates = [HasSMemTimeInst] in { def : GCNPat < (i64 (readcyclecounter)), (S_MEMTIME) >; +} // let OtherPredicates = [HasSMemTimeInst] + +let OtherPredicates = [HasNoSMemTimeInst] in { +def : GCNPat < + (i64 (readcyclecounter)), + (REG_SEQUENCE SReg_64, + (S_GETREG_B32 getHwRegImm.ret), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; +} // let OtherPredicates = [HasNoSMemTimeInst] //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -793,7 +793,11 @@ "$sdst, $simm16" >; +let hasSideEffects = 1 in { + let mayLoad = 1 in { +// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow +// its use in the readcyclecounter selection. 
def S_GETREG_B32 : SOPK_Pseudo < "s_getreg_b32", (outs SReg_32:$sdst), (ins hwreg:$simm16), @@ -801,7 +805,7 @@ >; } -let hasSideEffects = 1, mayLoad = 0, mayStore =0 in { +let mayLoad = 0, mayStore =0 in { def S_SETREG_B32 : SOPK_Pseudo < "s_setreg_b32", @@ -829,6 +833,7 @@ let Uses = [MODE]; } +} } // End hasSideEffects = 1 class SOPK_WAITCNT pat=[]> : diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -78,7 +78,11 @@ "HW_REG_XNACK_MASK", nullptr, // HW_ID1, no predictable values nullptr, // HW_ID2, no predictable values - "HW_REG_POPS_PACKER" + "HW_REG_POPS_PACKER", + nullptr, + nullptr, + nullptr, + "HW_REG_SHADER_CYCLES" }; } // namespace Hwreg diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -558,6 +558,9 @@ bool isVI(const MCSubtargetInfo &STI); bool isGFX9(const MCSubtargetInfo &STI); bool isGFX10(const MCSubtargetInfo &STI); +bool isGCN3Encoding(const MCSubtargetInfo &STI); +bool isGFX10_BEncoding(const MCSubtargetInfo &STI); +bool hasGFX10_3Insts(const MCSubtargetInfo &STI); /// Is Reg - scalar register bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -311,7 +311,7 @@ // FIXME: Need to take scratch memory into account. if (!isGFX10(*STI)) return 10; - return 20; + return hasGFX10_3Insts(*STI) ? 16 : 20; } unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, @@ -441,12 +441,21 @@ bool IsWave32 = EnableWavefrontSize32 ? *EnableWavefrontSize32 : STI->getFeatureBits().test(FeatureWavefrontSize32); + + if (hasGFX10_3Insts(*STI)) + return IsWave32 ? 16 : 8; + return IsWave32 ? 8 : 4; } unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, Optional EnableWavefrontSize32) { - return getVGPRAllocGranule(STI, EnableWavefrontSize32); + + bool IsWave32 = EnableWavefrontSize32 ? + *EnableWavefrontSize32 : + STI->getFeatureBits().test(FeatureWavefrontSize32); + + return IsWave32 ? 
8 : 4; } unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) { @@ -732,13 +741,16 @@ return ID_SYMBOLIC_FIRST_GFX9_; else if (isGFX9(STI)) return ID_SYMBOLIC_FIRST_GFX10_; + else if (isGFX10(STI) && !isGFX10_BEncoding(STI)) + return ID_SYMBOLIC_FIRST_GFX1030_; else return ID_SYMBOLIC_LAST_; } bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) { - return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && - IdSymbolic[Id]; + return + ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) && + IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI)); } bool isValidHwreg(int64_t Id) { @@ -976,6 +988,14 @@ return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding]; } +bool isGFX10_BEncoding(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]; +} + +bool hasGFX10_3Insts(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts]; +} + bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -463,6 +463,7 @@ //===----------------------------------------------------------------------===// defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; +let SubtargetPredicate = HasMadMacF32Insts in def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>; let isCommutable = 1 in { @@ -489,12 +490,14 @@ defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN, xor>; let mayRaiseFPException = 0 in { +let SubtargetPredicate = HasMadMacF32Insts in { let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; +} // End SubtargetPredicate = HasMadMacF32Insts } // No patterns so that the scalar instructions are always selected. 
@@ -553,6 +556,7 @@
 let isCommutable = 1 in {
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let OtherPredicates = [HasMadMacF32Insts] in
 defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
 } // End SubtargetPredicate = isGFX6GFX7GFX10
 let SubtargetPredicate = isGFX6GFX7 in {
@@ -1278,6 +1282,7 @@
 defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
 defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
 defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+let OtherPredicates = [HasMadMacF32Insts] in
 defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
 defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
 defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
@@ -1610,3 +1615,9 @@
 let SubtargetPredicate = HasPkFmacF16Inst in {
 defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
 } // End SubtargetPredicate = HasPkFmacF16Inst
+
+let SubtargetPredicate = HasDot3Insts in {
+  // NB: Opcode conflicts with V_DOT2C_F32_F16
+  let DecoderNamespace = "GFX10_B" in
+  defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -292,8 +292,13 @@
 let isCommutable = 1 in {

 let mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasMadMacF32Insts in {
 def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
 def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+} // End SubtargetPredicate = HasMadMacF32Insts
+
+let SubtargetPredicate = HasNoMadMacF32Insts in
+def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
 }

 def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -1020,6 +1025,10 @@
 defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
 defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;

+// NB: Same opcode as v_mad_legacy_f32
+let DecoderNamespace = "GFX10_B" in
+defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>;
+
 //===----------------------------------------------------------------------===//
 // GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -3147,7 +3147,7 @@ ; CGP-NEXT: v_rcp_f32_e32 v2, v1 ; CGP-NEXT: v_mul_f32_e32 v2, v0, v2 ; CGP-NEXT: v_trunc_f32_e32 v2, v2 -; CGP-NEXT: v_mad_f32 v0, -v2, v1, v0 +; CGP-NEXT: v_fma_f32 v0, -v2, v1, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] @@ -3444,9 +3444,9 @@ ; CGP-NEXT: v_mul_f32_e32 v6, v2, v6 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 -; CGP-NEXT: v_mad_f32 v0, -v5, v3, v0 +; CGP-NEXT: v_fma_f32 v0, -v5, v3, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v2, -v6, v4, v2 +; CGP-NEXT: v_fma_f32 v2, -v6, v4, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 ; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -3116,7 +3116,7 @@ ; CGP-NEXT: v_rcp_f32_e32 v4, v3 ; CGP-NEXT: v_mul_f32_e32 v4, v2, v4 ; CGP-NEXT: v_trunc_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v2, -v4, v3, v2 +; CGP-NEXT: v_fma_f32 v2, -v4, v3, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 ; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] @@ -3411,9 +3411,9 @@ ; CGP-NEXT: v_mul_f32_e32 v10, v7, v10 ; CGP-NEXT: v_trunc_f32_e32 v9, v9 ; CGP-NEXT: v_trunc_f32_e32 v10, v10 -; CGP-NEXT: v_mad_f32 v5, -v9, v6, v5 +; CGP-NEXT: v_fma_f32 v5, -v9, v6, v5 ; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9 -; CGP-NEXT: v_mad_f32 v7, -v10, v8, v7 +; CGP-NEXT: v_fma_f32 v7, -v10, v8, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10 ; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -51,6 +51,7 @@ ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1011 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s ; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1012 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s +; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1030 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1030 %s ; ARCH-R600: Arch: r600 ; ARCH-GCN: Arch: amdgcn @@ -96,6 +97,7 @@ ; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33) ; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34) ; GFX1012: EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35) +; GFX1030: EF_AMDGPU_MACH_AMDGCN_GFX1030 (0x36) ; ALL: ] define amdgpu_kernel void @elf_header() { diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=preserve-sign 
-enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s -; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s +; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s ; FIXME: This should also fold when fma is actually fast if an FMA ; exists in the original program. diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -17,6 +17,8 @@ ; FIXME: Should probably test this, but sometimes selecting fmac is painful to match. ; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-FASTFMA-STRICT,GCN-DENORM-STRICT %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. 
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s diff --git a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll --- a/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-note-no-func.ll @@ -28,6 +28,7 @@ ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1010 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1011 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1011 %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1012 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1012 %s +; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1030 %s ; HSA: .hsa_code_object_version 2,1 ; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU" @@ -50,3 +51,4 @@ ; HSA-GFX1010: .hsa_code_object_isa 10,1,0,"AMD","AMDGPU" ; HSA-GFX1011: .hsa_code_object_isa 10,1,1,"AMD","AMDGPU" ; HSA-GFX1012: .hsa_code_object_isa 10,1,2,"AMD","AMDGPU" +; HSA-GFX1030: .hsa_code_object_isa 10,3,0,"AMD","AMDGPU" diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GCN + +declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1) +declare i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)*, i32) + +; GCN-LABEL: {{^}}buffer_atomic_csub: +; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc +define amdgpu_ps void @buffer_atomic_csub(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +main_body: + %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) + ret void +} + +; GCN-LABEL: 
{{^}}buffer_atomic_csub_off4_slc: +; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc +define amdgpu_ps void @buffer_atomic_csub_off4_slc(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { +main_body: + %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_csub: +; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off glc +define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) { +main_body: + %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data) + ret void +} + +; GCN-LABEL: {{^}}global_atomic_csub_off4: +; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 glc +define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) { +main_body: + %p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1 + %ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %p, i32 %data) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -0,0 +1,253 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +; GCN-LABEL: {{^}}load_1d: +; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; +define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_tfe: +; GFX10: image_msaa_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; +define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_lwe: +; GFX10: image_msaa_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; +define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>, i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2d: +; GFX10: image_msaa_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; +define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2d_tfe: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; +define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> 
%rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_3d: +; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; +define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_3d_tfe_lwe: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; +define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1darray: +; GFX10: image_msaa_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; +define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1darray_tfe: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; +define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2darray: +; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; +define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darray_lwe: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; +define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2dmsaa: +; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; +define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x 
float> %v +} + +; GCN-LABEL: {{^}}load_2dmsaa_both: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; +define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_2darraymsaa: +; GFX10: image_msaa_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; +define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_2darraymsaa_tfe: +; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; +define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3: +; GFX10: image_msaa_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2: +; GFX10: image_msaa_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1: +; GFX10: image_msaa_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1: +; GFX10: image_msaa_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 
dim:SQ_RSRC_IMG_1D unorm tfe ; +define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<2 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<2 x float>, i32} %v, 0 + %v.err = extractvalue {<2 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <2 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_V1: +; GFX10: image_msaa_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm ; +define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call float @llvm.amdgcn.image.msaa.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret float %v +} + +; GCN-LABEL: {{^}}load_1d_V2: +; GFX10: image_msaa_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm ; +define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <2 x float> @llvm.amdgcn.image.msaa.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <2 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_glc: +; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; +define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_slc: +; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; +define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}load_1d_glc_slc: +; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; +define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {float,i32} @llvm.amdgcn.image.msaa.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<2 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x 
i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
+
+declare float @llvm.amdgcn.image.msaa.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+declare float @llvm.amdgcn.image.msaa.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
+declare <2 x float> @llvm.amdgcn.image.msaa.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: not --crash llc -march=amdgcn -mcpu=gfx1030 -mattr=-flat-for-global -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX1030-ERR %s
declare i64 @llvm.amdgcn.s.memtime() #0
@@ -12,6 +13,7 @@
; SIVI-NOT: lgkmcnt
; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; GCN: {{buffer|global}}_store_dwordx2
+; GFX1030-ERR: ERROR
define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
  %cycle0 = call i64 @llvm.amdgcn.s.memtime()
  store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -1,14 +1,15 @@
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX908
-; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10,GFX1011
-; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10,GFX1011
+; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
+; RUN: llc -march=amdgcn 
-mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp) ; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp -; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} -; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} -; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, @@ -26,9 +27,9 @@ } ; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp -; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX1011: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch.ll @@ -0,0 +1,388 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s + +; GFX8-NOT: s_inst_prefetch +; GFX8-NOT: .palign 6 + +; GCN-LABEL: test_loop_64 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]: +; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_128 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) 
+ tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_192 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-NEXT: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_256 +; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400 +; GFX10-DIS-NEXT: {{^$}} +; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]: +; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>: +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc0 [[L1]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: ; preds = %bb2 + ret void + +bb2: ; preds = %bb2, %bb + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail 
call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_prefetch_inner_outer +; GFX10: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: .p2align 6 +; GFX10-DIS: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L2:BB[0-9_]+]]: +; GFX10-DIS: <[[L2:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc{{[01]}} [[L2]] +; GFX10-NOT: s_inst_prefetch +; GFX10: s_cbranch_scc{{[01]}} [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + br label %bb3 + +bb3: + %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ] + %tmp5 = add nuw nsw i32 %tmp4, 1 + %tmp6 = icmp eq i32 %tmp5, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void 
@llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp6, label %bb4, label %bb3 + +bb4: + br i1 %tmp3, label %bb1, label %bb2 +} + +; GCN-LABEL: test_loop_prefetch_inner_outer_noouter +; GFX10-NOT: .p2align 6 +; GFX10-NOT: s_nop +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L0:BB[0-9_]+]]: +; GFX10-DIS: <[[L0:BB[0-9_]+]]>: +; GFX10: s_inst_prefetch 0x1 +; GFX10-ASM-NEXT: .p2align 6 +; GFX10-DIS-NEXT: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L1:BB[0-9_]+]]: +; GFX10-DIS: <[[L1:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: .p2align 6 +; GFX10-DIS: s_nop 0 +; GFX10-NOT: s_inst_prefetch +; GFX10-ASM: [[L2:BB[0-9_]+]]: +; GFX10-DIS: <[[L2:BB[0-9_]+]]>: +; GFX10-NOT: s_inst_prefetch +; GFX10: s_sleep 0 +; GFX10: s_cbranch_scc{{[01]}} [[L2]] +; GFX10-NOT: s_inst_prefetch +; GFX10: s_cbranch_scc{{[01]}} [[L1]] +; GFX10-NEXT: s_inst_prefetch 0x2 +; GFX10: s_cbranch_scc{{[01]}} [[L0]] +; GFX10-NEXT: s_endpgm +define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) { +bb: + br label %bb2 + +bb1: + ret void + +bb2: + %tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ] + %tmp2 = add nuw nsw i32 %tmp1, 1 + %tmp3 = icmp eq i32 %tmp2, 1024 + br label %bb3 + +bb3: + %tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ] + %tmp5 = add nuw nsw i32 %tmp4, 1 + %tmp6 = icmp eq i32 %tmp5, 1024 + br label %bb4 + +bb4: + %tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ] + %tmp8 = add nuw nsw i32 %tmp7, 1 + %tmp9 = icmp eq i32 %tmp8, 1024 + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call 
void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp9, label %bb5, label %bb4 + +bb5: + br i1 %tmp6, label %bb6, label %bb3 + +bb6: + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + tail call void @llvm.amdgcn.s.sleep(i32 0) + br i1 %tmp3, label %bb1, label %bb2 +} + +declare void @llvm.amdgcn.s.sleep(i32) diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; FIXME: None of these trigger madmk emission anymore. 
It is still
diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
--- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}fold_sgpr:
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
--- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GETREG -check-prefix=GCN %s
declare i64 @llvm.readcyclecounter() #0
@@ -13,6 +14,14 @@
; MEMTIME: s_memtime s{{\[[0-9]+:[0-9]+\]}}
; MEMTIME: store_dwordx2
+; GETREG-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
+; GETREG-DAG: s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
+; GETREG-DAG: v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]]
+; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT1]]:[[ZERO]]], off
+; GETREG: s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
+; GETREG: v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]]
+; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT2]]:[[ZERO]]], off
+
define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
  %cycle0 = call i64 @llvm.readcyclecounter()
  store volatile i64 %cycle0, i64 addrspace(1)* %out
@@ -27,6 +36,7 @@
; GCN-LABEL: {{^}}test_readcyclecounter_smem:
; MEMTIME-DAG: s_memtime
; GCN-DAG: s_load_dword
+; GETREG-DAG: s_getreg_b32 s1, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(4)* inreg %in) #0 {
  %cycle0 = call i64 @llvm.readcyclecounter()
  %in.v = load i64, i64 addrspace(4)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -2,6 +2,7 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GFX1030 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@@ -186,6 +187,7 @@
; GCN-LABEL: {{^}}fdiv_test_denormals
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1030: v_fmac_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
bb:
%tmp = load i8, i8 addrspace(1)* null, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -1,4 +1,4 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1030_err.s @@ -0,0 +1,139 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX10 %s + +v_dot8c_i32_i4 v5, v1, v2 +// GFX10: error: + +v_dot8c_i32_i4 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 +// GFX10: error: + +v_dot8c_i32_i4 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 +// GFX10: error: + +v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX10: error: + +v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX10: error: + +s_get_waveid_in_workgroup s0 +// GFX10: error: + +s_memtime s[0:1] +// GFX10: error: + +s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) +// GFX10: error: + +v_mac_f32 v0, v1, v2 +// GFX10: error: + +v_mad_f32 v0, v1, v2, v3 +// GFX10: error: + +v_madak_f32 v0, v1, v2, 1 +// GFX10: error: + +v_madmk_f32 v0, v1, 1, v2 +// GFX10: error: + +v_mad_legacy_f32 v0, v1, v2, v3 +// GFX10: error: + +v_mac_legacy_f32 v0, v1, v2 +// GFX10: error: + +ds_add_src2_u32 v1 offset:65535 gds +// GFX10: error: + +ds_add_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_add_src2_f32 v1 offset:65535 +// GFX10: error: + +ds_sub_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_rsub_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_inc_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_dec_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_min_src2_i32 v1 offset:65535 +// GFX10: error: + +ds_max_src2_i32 v1 offset:65535 +// GFX10: error: + +ds_min_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_max_src2_u32 v1 offset:65535 +// GFX10: error: + +ds_and_src2_b32 v1 offset:65535 +// GFX10: error: + +ds_or_src2_b32 v1 offset:65535 +// GFX10: error: + +ds_xor_src2_b32 v1 offset:65535 +// GFX10: error: + +ds_min_src2_f32 v1 offset:65535 +// GFX10: error: + +ds_max_src2_f32 v1 offset:65535 +// GFX10: error: + +ds_add_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_sub_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_rsub_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_inc_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_dec_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_min_src2_i64 v1 offset:65535 +// GFX10: error: + +ds_max_src2_i64 v1 offset:65535 +// GFX10: error: + +ds_min_src2_u64 v1 offset:65535 +// GFX10: error: + +ds_max_src2_u64 v1 offset:65535 +// GFX10: 
error: + +ds_and_src2_b64 v1 offset:65535 +// GFX10: error: + +ds_or_src2_b64 v1 offset:65535 +// GFX10: error: + +ds_xor_src2_b64 v1 offset:65535 +// GFX10: error: + +ds_min_src2_f64 v1 offset:65535 +// GFX10: error: + +ds_max_src2_f64 v1 offset:65535 +// GFX10: error: + +ds_write_src2_b32 v1 offset:65535 +// GFX10: error: + +ds_write_src2_b64 v1 offset:65535 +// GFX10: error: diff --git a/llvm/test/MC/AMDGPU/gfx1030_new.s b/llvm/test/MC/AMDGPU/gfx1030_new.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1030_new.s @@ -0,0 +1,76 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck --check-prefix=GFX10 %s + +global_load_dword_addtid v1, s[2:3] offset:16 +// GFX10: encoding: [0x10,0x80,0x58,0xdc,0x00,0x00,0x02,0x01] + +global_load_dword_addtid v1, s[2:3] offset:16 glc slc dlc +// GFX10: encoding: [0x10,0x90,0x5b,0xdc,0x00,0x00,0x02,0x01] + +global_store_dword_addtid v1, s[2:3] offset:16 glc slc dlc +// GFX10: encoding: [0x10,0x90,0x5f,0xdc,0x00,0x01,0x02,0x00] + +global_store_dword v[254:255], v1, s[2:3] offset:16 +// GFX10: encoding: [0x10,0x80,0x70,0xdc,0xfe,0x01,0x02,0x00] + +global_atomic_csub v2, v[0:1], v2, off offset:100 glc slc +// GFX10: encoding: [0x64,0x80,0xd3,0xdc,0x00,0x02,0x7d,0x02] + +global_atomic_csub v2, v[0:1], v2, off +// GFX10: encoding: [0x00,0x80,0xd1,0xdc,0x00,0x02,0x7d,0x02] + +global_atomic_csub v2, v[0:1], v2, s[2:3] +// GFX10: encoding: [0x00,0x80,0xd1,0xdc,0x00,0x02,0x02,0x02] + +global_atomic_csub v2, v[0:1], v2, s[2:3] offset:100 glc slc +// GFX10: encoding: [0x64,0x80,0xd3,0xdc,0x00,0x02,0x02,0x02] + +buffer_atomic_csub v5, off, s[8:11], s3 +// GFX10: encoding: [0x00,0x40,0xd0,0xe0,0x00,0x05,0x02,0x03] + +buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc +// GFX10: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03] + +buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc +// GFX10: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0xc1] + +buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc +// GFX10: encoding: [0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03] + +buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc +// GFX10: encoding: [0xff,0x6f,0xd0,0xe0,0x00,0x05,0x02,0x03] + +buffer_atomic_csub v5, off, s[8:11], s3 glc slc +// GFX10: encoding: [0x00,0x40,0xd0,0xe0,0x00,0x05,0x42,0x03] + +s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES) +// GFX10: encoding: [0x1d,0xf8,0x02,0xb9] + +s_getreg_b32 s2, 29 +// GFX10: s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES, 0, 1) ; encoding: [0x1d,0x00,0x02,0xb9] + +s_getreg_b32 s2, hwreg(22) +// GFX10: s_getreg_b32 s2, hwreg(22) ; encoding: [0x16,0xf8,0x02,0xb9] + +v_fma_legacy_f32 v0, v1, v2, v3 +// GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0x05,0x0e,0x04] + +v_fma_legacy_f32 v0, v1, |v2|, -v3 +// GFX10: encoding: [0x00,0x02,0x40,0xd5,0x01,0x05,0x0e,0x84] + +v_fma_legacy_f32 v0, s1, 2.0, -v3 +// GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84] + +image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D +// GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00] + +image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D glc +// GFX10: encoding: [0x01,0x2f,0x00,0xf0,0x05,0x01,0x02,0x00] + +image_msaa_load v5, v[1:2], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +// GFX10: encoding: [0x09,0x01,0x00,0xf0,0x01,0x05,0x02,0x80] + +image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D +// GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00] + +image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +// 
GFX10: encoding: [0x3b,0x01,0x00,0xf0,0xcc,0x0e,0x0a,0x00,0x0b,0x0e,0x13,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt @@ -0,0 +1,67 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1030 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX10 %s + +# GFX10: global_load_dword_addtid v1, s[2:3] offset:16 +0x10,0x80,0x58,0xdc,0x00,0x00,0x02,0x01 + +# GFX10: global_load_dword_addtid v1, s[2:3] offset:16 glc slc dlc +0x10,0x90,0x5b,0xdc,0x00,0x00,0x02,0x01 + +# GFX10: global_store_dword_addtid v1, s[2:3] offset:16 glc slc dlc +0x10,0x90,0x5f,0xdc,0x00,0x01,0x02,0x00 + +# GFX10: global_store_dword v[254:255], v1, s[2:3] offset:16 +0x10,0x80,0x70,0xdc,0xfe,0x01,0x02,0x00 + +# GFX10: global_atomic_csub v2, v[0:1], v2, off offset:100 glc slc +0x64,0x80,0xd3,0xdc,0x00,0x02,0x7d,0x02 + +# GFX10: global_atomic_csub v2, v[0:1], v2, off glc +0x00,0x80,0xd1,0xdc,0x00,0x02,0x7d,0x02 + +# GFX10: global_atomic_csub v2, v[0:1], v2, s[2:3] glc +0x00,0x80,0xd1,0xdc,0x00,0x02,0x02,0x02 + +# GFX10: global_atomic_csub v2, v[0:1], v2, s[2:3] offset:100 glc slc +0x64,0x80,0xd3,0xdc,0x00,0x02,0x02,0x02 + +# GFX10: buffer_atomic_csub v5, off, s[8:11], s3 +0x00,0x40,0xd0,0xe0,0x00,0x05,0x02,0x03 + +# GFX10: buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc +0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03 + +# GFX10: buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc +0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0xc1 + +# GFX10: buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc +0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03 + +# GFX10: buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc +0xff,0x6f,0xd0,0xe0,0x00,0x05,0x02,0x03 + +# GFX10: buffer_atomic_csub v5, off, s[8:11], s3 glc slc +0x00,0x40,0xd0,0xe0,0x00,0x05,0x42,0x03 + +# GFX10: v_fma_legacy_f32 v0, v1, v2, v3 +0x00,0x00,0x40,0xd5,0x01,0x05,0x0e,0x04 + +# GFX10: v_fma_legacy_f32 v0, v1, |v2|, -v3 +0x00,0x02,0x40,0xd5,0x01,0x05,0x0e,0x84 + +# GFX10: v_fma_legacy_f32 v0, s1, 2.0, -v3 +0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84 + +# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D +0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00 + +# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D glc +0x01,0x2f,0x00,0xf0,0x05,0x01,0x02,0x00 + +# GFX10: image_msaa_load v5, v[1:2], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_2D d16 +0x09,0x01,0x00,0xf0,0x01,0x05,0x02,0x80 + +# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D +0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00 + +# GFX10: image_msaa_load v14, [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +0x3b,0x01,0x00,0xf0,0xcc,0x0e,0x0a,0x00,0x0b,0x0e,0x13,0x00 diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1797,6 +1797,7 @@ LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1012), + LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1030), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK), LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC) };