Index: clang/include/clang/Basic/BuiltinsAMDGPU.def =================================================================== --- clang/include/clang/Basic/BuiltinsAMDGPU.def +++ clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -209,6 +209,12 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f64, "dd*3d", "t", "gfx90a-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_f32, "ff*3f", "t", "gfx8-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_f32, "ff*0f", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2f16, "V2hV2h*0V2h", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "gfx940-insts") + //===----------------------------------------------------------------------===// // Deep learning builtins. //===----------------------------------------------------------------------===// Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -16510,7 +16510,9 @@ case AMDGPU::BI__builtin_amdgcn_global_atomic_fmax_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmin_f64: - case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: { + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: { Intrinsic::ID IID; llvm::Type *ArgTy = llvm::Type::getDoubleTy(getLLVMContext()); switch (BuiltinID) { @@ -16541,6 +16543,15 @@ case AMDGPU::BI__builtin_amdgcn_flat_atomic_fmax_f64: IID = Intrinsic::amdgcn_flat_atomic_fmax; break; + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_f32: + ArgTy = llvm::Type::getFloatTy(getLLVMContext()); + IID = 
Intrinsic::amdgcn_flat_atomic_fadd; + break; + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2f16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getHalfTy(getLLVMContext()), 2); + IID = Intrinsic::amdgcn_flat_atomic_fadd; + break; } llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); llvm::Value *Val = EmitScalarExpr(E->getArg(1)); @@ -16548,6 +16559,22 @@ CGM.getIntrinsic(IID, {ArgTy, Addr->getType(), Val->getType()}); return Builder.CreateCall(F, {Addr, Val}); } + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: { + Intrinsic::ID IID; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_global_atomic_fadd_v2bf16: + IID = Intrinsic::amdgcn_global_atomic_fadd_v2bf16; + break; + case AMDGPU::BI__builtin_amdgcn_flat_atomic_fadd_v2bf16: + IID = Intrinsic::amdgcn_flat_atomic_fadd_v2bf16; + break; + } + llvm::Value *Addr = EmitScalarExpr(E->getArg(0)); + llvm::Value *Val = EmitScalarExpr(E->getArg(1)); + llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()}); + return Builder.CreateCall(F, {Addr, Val}); + } case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64: case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32: { Intrinsic::ID IID; Index: clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/builtins-amdgcn-fp-atomics-gfx90a-err.cl @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -verify -S -o - %s + +// REQUIRES: amdgpu-registered-target + +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; + +void test_atomic_fadd(__global half2 *addrh2, half2 xh2, + __global short2 *addrs2, __local short2 *addrs2l, short2 xs2, + __global float *addrf, float xf) { + __builtin_amdgcn_flat_atomic_fadd_f32(addrf, xf); // 
expected-error{{'__builtin_amdgcn_flat_atomic_fadd_f32' needs target feature gfx940-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2f16(addrh2, xh2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2f16' needs target feature gfx940-insts}} + __builtin_amdgcn_flat_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_flat_atomic_fadd_v2bf16' needs target feature gfx940-insts}} + __builtin_amdgcn_global_atomic_fadd_v2bf16(addrs2, xs2); // expected-error{{'__builtin_amdgcn_global_atomic_fadd_v2bf16' needs target feature gfx940-insts}} + __builtin_amdgcn_ds_atomic_fadd_v2bf16(addrs2l, xs2); // expected-error{{'__builtin_amdgcn_ds_atomic_fadd_v2bf16' needs target feature gfx940-insts}} +} Index: clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl =================================================================== --- /dev/null +++ clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ +// RUN: %s -S -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX940 %s + +// REQUIRES: amdgpu-registered-target + +typedef half __attribute__((ext_vector_type(2))) half2; +typedef short __attribute__((ext_vector_type(2))) short2; + +// CHECK-LABEL: test_flat_add_f32 +// CHECK: call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %{{.*}}, float %{{.*}}) +// GFX940-LABEL: test_flat_add_f32 +// GFX940: flat_atomic_add_f32 +float test_flat_add_f32(__generic float *addr, float x) { + return 
__builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); +} + +// CHECK-LABEL: test_flat_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %{{.*}}, <2 x i16> %{{.*}}) +// GFX940-LABEL: test_flat_add_2bf16 +// GFX940: flat_atomic_pk_add_bf16 +short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { + return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_global_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %{{.*}}, <2 x i16> %{{.*}}) +// GFX940-LABEL: test_global_add_2bf16 +// GFX940: global_atomic_pk_add_bf16 +short2 test_global_add_2bf16(__global short2 *addr, short2 x) { + return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); +} + +// CHECK-LABEL: test_local_add_2bf16 +// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %{{.*}}, <2 x i16> % +// GFX940-LABEL: test_local_add_2bf16 +// GFX940: ds_pk_add_rtn_bf16 +short2 test_local_add_2bf16(__local short2 *addr, short2 x) { + return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); +} Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1978,6 +1978,19 @@ def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic; +//===----------------------------------------------------------------------===// +// gfx940 intrinsics +//===----------------------------------------------------------------------===// + +// bf16 atomics use v2i16 argument since there is no bf16 data type in LLVM.
+def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn; +def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn; +def int_amdgcn_ds_fadd_v2bf16 : Intrinsic< + [llvm_v2i16_ty], + [LLVMQualPointerType, llvm_v2i16_ty], + [IntrArgMemOnly, IntrWillReturn, NoCapture>]>, + GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1545,6 +1545,13 @@ def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">, AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>; +// Differentiate between two functionally equivalent, but incompatible +// encoding-wise FP atomics between gfx90* and gfx940 +def HasAtomicFaddInstsGFX90X : Predicate<"Subtarget->hasAtomicFaddInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddInsts, (not FeatureGFX940Insts))>; +def HasAtomicFaddInstsGFX940 : Predicate<"Subtarget->hasAtomicFaddInsts()">, + AssemblerPredicate<(all_of FeatureAtomicFaddInsts, FeatureGFX940Insts)>; + def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1297,6 +1297,8 @@ Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); + if (ST.hasGFX940Insts()) + Atomic.legalFor({{V2S16, LocalPtr}}); } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
=================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4339,6 +4339,8 @@ case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: { Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -205,9 +205,12 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; Index: llvm/lib/Target/AMDGPU/DSInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/DSInstructions.td +++ llvm/lib/Target/AMDGPU/DSInstructions.td @@ -462,6 +462,13 @@ defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">; } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = isGFX940Plus in { + defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">; + defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">; + defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">; + defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">; +} // End SubtargetPredicate = isGFX940Plus + defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">; defm DS_CMPST_B32 : DS_1A2D_NORET_mc<"ds_cmpst_b32">; defm DS_CMPST_F32 : 
DS_1A2D_NORET_mc<"ds_cmpst_f32">; @@ -998,6 +1005,14 @@ def : DSAtomicRetPat; } +let SubtargetPredicate = isGFX940Plus in { +def : DSAtomicRetPat; +def : GCNPat < + (v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)), + (DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0) +>; +} + def : Pat < (SIds_ordered_count i32:$value, i16:$offset), (DS_ORDERED_COUNT $value, (as_i16imm $offset)) @@ -1410,3 +1425,10 @@ def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; } // End SubtargetPredicate = isGFX90APlus + +let SubtargetPredicate = isGFX940Plus in { + def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; + def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; + def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; + def DS_PK_ADD_RTN_BF16_vi : DS_Real_vi<0xb8, DS_PK_ADD_RTN_BF16>; +} // End SubtargetPredicate = isGFX940Plus Index: llvm/lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/FLATInstructions.td +++ llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -630,6 +630,13 @@ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>; } // End SubtargetPredicate = isGFX90APlus +let SubtargetPredicate = isGFX940Plus in { + defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>; +} // End SubtargetPredicate = isGFX940Plus + defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", 
VGPR_32>; defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>; @@ -1280,6 +1287,13 @@ def : FlatSignedAtomicPat ; } +let OtherPredicates = [isGFX940Plus] in { +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +def : FlatSignedAtomicPat ; +defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>; +} + } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { @@ -1432,6 +1446,14 @@ def _SADDR_vi : FLAT_Real_vi(NAME#"_SADDR"), has_sccb>; } +class FLAT_Real_gfx940 op, FLAT_Pseudo ps> : + FLAT_Real , + SIMCInstr { + let AssemblerPredicate = isGFX940Plus; + let DecoderNamespace = "GFX9"; + let Inst{25} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccbValue); +} + def FLAT_LOAD_UBYTE_vi : FLAT_Real_vi <0x10, FLAT_LOAD_UBYTE>; def FLAT_LOAD_SBYTE_vi : FLAT_Real_vi <0x11, FLAT_LOAD_SBYTE>; def FLAT_LOAD_USHORT_vi : FLAT_Real_vi <0x12, FLAT_LOAD_USHORT>; @@ -1574,7 +1596,7 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_vi <0x1f>; -let SubtargetPredicate = HasAtomicFaddInsts in { +let SubtargetPredicate = HasAtomicFaddInstsGFX90X in { defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } @@ -1588,6 +1610,40 @@ defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_vi<0x51, 0>; } // End SubtargetPredicate = isGFX90AOnly +multiclass FLAT_Real_AllAddr_gfx940 op> { + def _gfx940 : FLAT_Real_gfx940(NAME)>; + def _SADDR_gfx940 : FLAT_Real_gfx940(NAME#"_SADDR")>; +} + +multiclass FLAT_Real_Atomics_gfx940 op, FLAT_Pseudo ps> { + def _gfx940 : FLAT_Real_gfx940(ps.PseudoInstr)>; + def _RTN_gfx940 : FLAT_Real_gfx940(ps.PseudoInstr # "_RTN")>; +} + +multiclass FLAT_Global_Real_Atomics_gfx940 op> : + FLAT_Real_AllAddr_gfx940 { + def _RTN_gfx940 : FLAT_Real_gfx940 (NAME#"_RTN")>; + 
def _SADDR_RTN_gfx940 : FLAT_Real_gfx940 (NAME#"_SADDR_RTN")>; +} + +let SubtargetPredicate = HasAtomicFaddInstsGFX940 in { + defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_gfx940 <0x04d>; + defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_gfx940 <0x04e>; +} + +let SubtargetPredicate = isGFX940Plus in { + defm FLAT_ATOMIC_ADD_F64 : FLAT_Real_Atomics_gfx940<0x4f, FLAT_ATOMIC_ADD_F64>; + defm FLAT_ATOMIC_MIN_F64 : FLAT_Real_Atomics_gfx940<0x50, FLAT_ATOMIC_MIN_F64>; + defm FLAT_ATOMIC_MAX_F64 : FLAT_Real_Atomics_gfx940<0x51, FLAT_ATOMIC_MAX_F64>; + defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Real_Atomics_gfx940<0x4f>; + defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Real_Atomics_gfx940<0x50>; + defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Real_Atomics_gfx940<0x51>; + defm FLAT_ATOMIC_ADD_F32 : FLAT_Real_Atomics_vi<0x4d, FLAT_ATOMIC_ADD_F32>; + defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Real_Atomics_vi<0x4e, FLAT_ATOMIC_PK_ADD_F16>; + defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Real_Atomics_vi<0x52, FLAT_ATOMIC_PK_ADD_BF16>; + defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Real_Atomics_vi<0x52>; +} // End SubtargetPredicate = isGFX940Plus + //===----------------------------------------------------------------------===// // GFX10. 
//===----------------------------------------------------------------------===// Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1270,7 +1270,9 @@ case Intrinsic::amdgcn_global_atomic_fmax: case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: - case Intrinsic::amdgcn_flat_atomic_fmax: { + case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1326,6 +1328,8 @@ case Intrinsic::amdgcn_flat_atomic_fadd: case Intrinsic::amdgcn_flat_atomic_fmin: case Intrinsic::amdgcn_flat_atomic_fmax: + case Intrinsic::amdgcn_global_atomic_fadd_v2bf16: + case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: case Intrinsic::amdgcn_global_atomic_csub: { Value *Ptr = II->getArgOperand(0); AccessTy = II->getType(); @@ -12448,6 +12452,9 @@ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && Subtarget->hasAtomicFaddInsts()) { + if (Subtarget->hasGFX940Insts()) + return AtomicExpansionKind::None; + // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. 
May generate more efficient code, // but may not respect rounding and denormal modes, and may give incorrect Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 + +declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) +declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + +; bf16 atomics use v2i16 argument since there is no bf16 data type in LLVM. +declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) +declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3) * %ptr, <2 x half> %data, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3) * %ptr, <2 x i16> %data) + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) + ret void +} + +define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; 
GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) + ret float %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + ret <2 x half> %ret +} + +define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret void +} + +define <2 x half> @local_atomic_fadd_v2f16_rtn(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX940: ; 
%bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret <2 x half> %ret +} Index: llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 + +declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) +declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + +; bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. +declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) +declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) +declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3) * %ptr, <2 x half> %data, i32, i32, i1) +declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3) * %ptr, <2 x i16> %data) + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) + ret void +} + +define amdgpu_kernel void 
@flat_atomic_fadd_f32_noret_pat(float* %ptr) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_invl2 +; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_invl2 +; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) + ret float %ret +} + +define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_invl2 +; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret float %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_rtn(<2 x half>* %ptr, <2 x half> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2f16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) + ret <2 x half> %ret +} + +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* 
%ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v0, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v1, s4 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v1, s[2:3] +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: ds_pk_add_f16 v0, v1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x half> 
@llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret void +} + +define <2 x half> @local_atomic_fadd_v2f16_rtn(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { +; GFX940-LABEL: local_atomic_fadd_v2f16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_f16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) + ret <2 x half> %ret +} + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s3, s[0:1], 0x28 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_bf16 v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_invl2 +; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_invl2 +; GFX940-NEXT: buffer_wbinvl1_vol +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +attributes #0 = { 
"denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/MC/AMDGPU/gfx940_asm_features.s =================================================================== --- llvm/test/MC/AMDGPU/gfx940_asm_features.s +++ llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -1,6 +1,6 @@ // RUN: llvm-mc -arch=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s // RUN: not llvm-mc -arch=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s -// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940 --implicit-check-not=error: %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s // NOT-GFX940: error: invalid operand for instruction // GFX940: global_load_dword v2, v[2:3], off sc0 ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02] @@ -33,6 +33,102 @@ // GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03] buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_add_f32 v[2:3], v1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00] +flat_atomic_add_f32 v[2:3], v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_add_f32 v[2:3], a1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00] +flat_atomic_add_f32 v[2:3], a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_add_f32 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04] +flat_atomic_add_f32 v4, v[2:3], v1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_add_f32 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04] +flat_atomic_add_f32 a4, v[2:3], a1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: 
flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04] +flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04] +flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_f16 v[2:3], v1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00] +flat_atomic_pk_add_f16 v[2:3], v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_f16 v[2:3], a1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00] +flat_atomic_pk_add_f16 v[2:3], a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04] +flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04] +flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_bf16 v[2:3], v1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00] +flat_atomic_pk_add_bf16 v[2:3], v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: flat_atomic_pk_add_bf16 v[2:3], a1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00] +flat_atomic_pk_add_bf16 v[2:3], a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04] +global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04] 
+global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: global_atomic_pk_add_bf16 v[2:3], v1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00] +global_atomic_pk_add_bf16 v[2:3], v1, off + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: global_atomic_pk_add_bf16 v[2:3], a1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00] +global_atomic_pk_add_bf16 v[2:3], a1, off + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00] +ds_pk_add_f16 v2, v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_f16 v2, a1 ; encoding: [0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00] +ds_pk_add_f16 v2, a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03] +ds_pk_add_rtn_f16 v3, v2, v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_rtn_f16 a3, v2, a1 ; encoding: [0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03] +ds_pk_add_rtn_f16 a3, v2, a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00] +ds_pk_add_bf16 v2, v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_bf16 v2, a1 ; encoding: [0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00] +ds_pk_add_bf16 v2, a1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03] +ds_pk_add_rtn_bf16 v3, v2, v1 + +// NOT-GFX940: error: instruction not supported on this GPU +// GFX940: ds_pk_add_rtn_bf16 a3, v2, a1 ; encoding: [0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03] +ds_pk_add_rtn_bf16 a3, v2, a1 + // NOT-GFX940: error: instruction not supported on this GPU // GFX940: 
v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e] v_mov_b64 v[2:3], v[4:5] @@ -68,3 +164,63 @@ // GFX90A: error: instruction not supported on this GPU // GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42] v_fmaak_f32 v0, v2, v3, 100.0 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_add_f32 v[0:1], v2, off sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_add_f32 v[0:1], v2, off sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_add_f32 v0, v2, s[0:1] sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00] +global_atomic_add_f32 v0, v2, s[0:1] sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01] +global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; encoding: [0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: 
instruction not supported on this GPU +// GFX940: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00] +flat_atomic_add_f64 v[0:1], v[2:3] sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: flat_atomic_min_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00] +flat_atomic_min_f64 v[0:1], v[2:3] sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: flat_atomic_max_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00] +flat_atomic_max_f64 v[0:1], v[2:3] sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_add_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_add_f64 v[0:1], v[2:3], off sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_min_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_min_f64 v[0:1], v[2:3], off sc1 + +// GFX90A: error: invalid operand for instruction +// GFX10: error: instruction not supported on this GPU +// GFX940: global_atomic_max_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00] +global_atomic_max_f64 v[0:1], v[2:3], off sc1 Index: llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt =================================================================== --- llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt +++ llvm/test/MC/Disassembler/AMDGPU/gfx940_dasm_features.txt @@ -15,6 +15,78 @@ # GFX940: buffer_load_dword v5, off, s[8:11], s3 sc0 nt sc1 ; encoding: [0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03] 0x00,0xc0,0x52,0xe0,0x00,0x05,0x02,0x03 +# GFX940: flat_atomic_add_f32 v[2:3], v1 ; encoding: 
[0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00] +0x00,0x00,0x34,0xdd,0x02,0x01,0x00,0x00 + +# GFX940: flat_atomic_add_f32 v[2:3], a1 ; encoding: [0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00] +0x00,0x00,0x34,0xdd,0x02,0x01,0x80,0x00 + +# GFX940: flat_atomic_add_f32 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04] +0x00,0x00,0x35,0xdd,0x02,0x01,0x00,0x04 + +# GFX940: flat_atomic_add_f32 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04] +0x00,0x00,0x35,0xdd,0x02,0x01,0x80,0x04 + +# GFX940: flat_atomic_pk_add_f16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04] +0x00,0x00,0x39,0xdd,0x02,0x01,0x00,0x04 + +# GFX940: flat_atomic_pk_add_f16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04] +0x00,0x00,0x39,0xdd,0x02,0x01,0x80,0x04 + +# GFX940: flat_atomic_pk_add_f16 v[2:3], v1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00] +0x00,0x00,0x38,0xdd,0x02,0x01,0x00,0x00 + +# GFX940: flat_atomic_pk_add_f16 v[2:3], a1 ; encoding: [0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00] +0x00,0x00,0x38,0xdd,0x02,0x01,0x80,0x00 + +# GFX940: flat_atomic_pk_add_bf16 v4, v[2:3], v1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04] +0x00,0x00,0x49,0xdd,0x02,0x01,0x00,0x04 + +# GFX940: flat_atomic_pk_add_bf16 a4, v[2:3], a1 sc0 ; encoding: [0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04] +0x00,0x00,0x49,0xdd,0x02,0x01,0x80,0x04 + +# GFX940: flat_atomic_pk_add_bf16 v[2:3], v1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00] +0x00,0x00,0x48,0xdd,0x02,0x01,0x00,0x00 + +# GFX940: flat_atomic_pk_add_bf16 v[2:3], a1 ; encoding: [0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00] +0x00,0x00,0x48,0xdd,0x02,0x01,0x80,0x00 + +# GFX940: global_atomic_pk_add_bf16 v4, v[2:3], v1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04] +0x00,0x80,0x49,0xdd,0x02,0x01,0x7f,0x04 + +# GFX940: global_atomic_pk_add_bf16 a4, v[2:3], a1, off sc0 ; encoding: [0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04] 
+0x00,0x80,0x49,0xdd,0x02,0x01,0xff,0x04 + +# GFX940: global_atomic_pk_add_bf16 v[2:3], v1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00] +0x00,0x80,0x48,0xdd,0x02,0x01,0x7f,0x00 + +# GFX940: global_atomic_pk_add_bf16 v[2:3], a1, off ; encoding: [0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00] +0x00,0x80,0x48,0xdd,0x02,0x01,0xff,0x00 + +# GFX940: ds_pk_add_f16 v2, v1 ; encoding: [0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00] +0x00,0x00,0x2e,0xd8,0x02,0x01,0x00,0x00 + +# GFX940: ds_pk_add_f16 v2, a1 ; encoding: [0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00] +0x00,0x00,0x2e,0xda,0x02,0x01,0x00,0x00 + +# GFX940: ds_pk_add_rtn_f16 v3, v2, v1 ; encoding: [0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03] +0x00,0x00,0x6e,0xd9,0x02,0x01,0x00,0x03 + +# GFX940: ds_pk_add_rtn_f16 a3, v2, a1 ; encoding: [0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03] +0x00,0x00,0x6e,0xdb,0x02,0x01,0x00,0x03 + +# GFX940: ds_pk_add_bf16 v2, v1 ; encoding: [0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00] +0x00,0x00,0x30,0xd8,0x02,0x01,0x00,0x00 + +# GFX940: ds_pk_add_bf16 v2, a1 ; encoding: [0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00] +0x00,0x00,0x30,0xda,0x02,0x01,0x00,0x00 + +# GFX940: ds_pk_add_rtn_bf16 v3, v2, v1 ; encoding: [0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03] +0x00,0x00,0x70,0xd9,0x02,0x01,0x00,0x03 + +# GFX940: ds_pk_add_rtn_bf16 a3, v2, a1 ; encoding: [0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03] +0x00,0x00,0x70,0xdb,0x02,0x01,0x00,0x03 + # GFX940: v_mov_b64_e32 v[2:3], v[4:5] ; encoding: [0x04,0x71,0x04,0x7e] 0x04,0x71,0x04,0x7e @@ -41,3 +113,39 @@ # GFX940: v_fmaak_f32 v0, v2, v3, 0x42c80000 ; encoding: [0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42] 0x02,0x07,0x00,0x30,0x00,0x00,0xc8,0x42 + +# GFX940: global_atomic_add_f32 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x35,0xdf,0x00,0x02,0x7f,0x00 + +# GFX940: global_atomic_add_f32 v[0:1], v2, off sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x34,0xdf,0x00,0x02,0x7f,0x00 + +# GFX940: 
global_atomic_add_f32 v0, v2, s[0:1] sc1 ; encoding: [0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00] +0x00,0x80,0x34,0xdf,0x00,0x02,0x00,0x00 + +# GFX940: global_atomic_add_f32 v1, v0, v2, s[0:1] sc0 sc1 ; encoding: [0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01] +0x00,0x80,0x35,0xdf,0x00,0x02,0x00,0x01 + +# GFX940: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 ; encoding: [0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x39,0xdf,0x00,0x02,0x7f,0x00 + +# GFX940: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 ; encoding: [0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00] +0x00,0x00,0x3d,0xdf,0x00,0x02,0x00,0x00 + +# GFX940: flat_atomic_add_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00] +0x00,0x00,0x3c,0xdf,0x00,0x02,0x00,0x00 + +# GFX940: flat_atomic_min_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00] +0x00,0x00,0x40,0xdf,0x00,0x02,0x00,0x00 + +# GFX940: flat_atomic_max_f64 v[0:1], v[2:3] sc1 ; encoding: [0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00] +0x00,0x00,0x44,0xdf,0x00,0x02,0x00,0x00 + +# GFX940: global_atomic_add_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x3c,0xdf,0x00,0x02,0x7f,0x00 + +# GFX940: global_atomic_min_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x40,0xdf,0x00,0x02,0x7f,0x00 + +# GFX940: global_atomic_max_f64 v[0:1], v[2:3], off sc1 ; encoding: [0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00] +0x00,0x80,0x44,0xdf,0x00,0x02,0x7f,0x00