Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -294,6 +294,19 @@
 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
 
+class AMDGPUAtomicF32Intrin : Intrinsic<[llvm_float_ty],
+  [LLVMAnyPointerType<llvm_float_ty>,
+   llvm_float_ty,
+   llvm_i32_ty, // ordering
+   llvm_i32_ty, // scope
+   llvm_i1_ty], // isVolatile
+  [IntrArgMemOnly, NoCapture<0>]
+>;
+
+def int_amdgcn_atomic_add : AMDGPUAtomicF32Intrin;
+def int_amdgcn_atomic_min : AMDGPUAtomicF32Intrin;
+def int_amdgcn_atomic_max : AMDGPUAtomicF32Intrin;
+
 class AMDGPUImageLoad : Intrinsic <
   [llvm_anyfloat_ty], // vdata(VGPR)
   [llvm_anyint_ty,    // vaddr(VGPR)
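
Note: each new intrinsic takes the pointer, the float operand, and three immediates
(ordering, scope, isVolatile), mirroring llvm.amdgcn.atomic.inc/dec, and returns the
memory value from before the operation. A minimal IR sketch of a call; the ".f32"
overload suffix follows the test added at the end of this patch, and the exact
mangling for other pointer types is an assumption:

  declare float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* nocapture, float, i32, i32, i1)

  define amdgpu_kernel void @sketch(float addrspace(3)* %lds, float %v) {
    ; ordering = 0, scope = 0, isVolatile = false
    %old = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %lds,
                                                  float %v, i32 0, i32 0, i1 false)
    ret void
  }
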
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -450,7 +450,10 @@
   }
 
   if (isa<AtomicSDNode>(N) ||
-      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
     N = glueCopyToM0(N);
 
   switch (Opc) {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -445,6 +445,9 @@
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
+  ATOMIC_LOAD_FADD,
+  ATOMIC_LOAD_FMIN,
+  ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
   BUFFER_STORE,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3980,6 +3980,9 @@
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_STORE)
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -449,6 +449,9 @@
   case Intrinsic::r600_read_tidig_z:
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max:
   case Intrinsic::amdgcn_image_atomic_swap:
   case Intrinsic::amdgcn_image_atomic_add:
   case Intrinsic::amdgcn_image_atomic_sub:
Index: lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- lib/Target/AMDGPU/DSInstructions.td
+++ lib/Target/AMDGPU/DSInstructions.td
@@ -440,7 +440,7 @@
 defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
 defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
 defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
 defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
 defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
@@ -769,6 +769,9 @@
 defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
 defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
 defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_local>;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, atomic_load_fadd_local>;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, atomic_load_fmin_local>;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, atomic_load_fmax_local>;
 
 // 64-bit atomics.
 defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -561,7 +561,10 @@
                                           unsigned IntrID) const {
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
     Info.ptrVal = CI.getOperand(0);
@@ -583,7 +586,10 @@
                                             Type *&AccessTy) const {
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max: {
     Value *Ptr = II->getArgOperand(0);
     AccessTy = II->getType();
     Ops.push_back(Ptr);
@@ -4259,10 +4265,31 @@
 
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max: {
     MemSDNode *M = cast<MemSDNode>(Op);
-    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
-      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+    unsigned Opc;
+    switch (IntrID) {
+    case Intrinsic::amdgcn_atomic_inc:
+      Opc = AMDGPUISD::ATOMIC_INC;
+      break;
+    case Intrinsic::amdgcn_atomic_dec:
+      Opc = AMDGPUISD::ATOMIC_DEC;
+      break;
+    case Intrinsic::amdgcn_atomic_add:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+      break;
+    case Intrinsic::amdgcn_atomic_min:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+      break;
+    case Intrinsic::amdgcn_atomic_max:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+      break;
+    default:
+      llvm_unreachable("Unknown intrinsic!");
+    }
 
     SDValue Ops[] = {
       M->getOperand(0), // Chain
       M->getOperand(2), // Ptr
@@ -6519,7 +6546,10 @@
   case ISD::ATOMIC_LOAD_UMIN:
   case ISD::ATOMIC_LOAD_UMAX:
   case AMDGPUISD::ATOMIC_INC:
-  case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+  case AMDGPUISD::ATOMIC_DEC:
+  case AMDGPUISD::ATOMIC_LOAD_FADD:
+  case AMDGPUISD::ATOMIC_LOAD_FMIN:
+  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
     if (DCI.isBeforeLegalize())
       break;
     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
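
Note: because getTgtMemIntrinsic() classifies these calls as memory intrinsics
(INTRINSIC_W_CHAIN with a MachineMemOperand), two textually identical calls are
not merged by CSE; the test below depends on this when it issues the same add
twice and expects both a ds_add_rtn_f32 and a ds_add_f32. A minimal sketch:

  define amdgpu_kernel void @no_cse(float addrspace(3)* %p, float %v) {
    ; Neither call may be dropped or merged, since both write memory. The
    ; unused results allow selection of the non-returning DS forms.
    %r0 = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %p,
                                                 float %v, i32 0, i32 0, i1 false)
    %r1 = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %p,
                                                 float %v, i32 0, i32 0, i1 false)
    ret void
  }
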
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -45,6 +45,22 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
+def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
+  SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
+]>;
+
+def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
   SDTypeProfile<1, 9,
     [ // vdata
@@ -163,6 +179,9 @@
 def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
 def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
+def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
 
 //===----------------------------------------------------------------------===//
 // SDNodes PatFrags for loads/stores with a glue input.
@@ -297,10 +316,11 @@
   (shl $src0, $src1)
 >;
 
-multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
+                            SDTypeProfile tc = SDTAtomic2> {
 
   def _glue : SDNode <
-    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
     [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
   >;
 
@@ -319,6 +339,9 @@
 defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
 defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
 defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
 
 def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
   [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
Index: lib/Transforms/Scalar/InferAddressSpaces.cpp
===================================================================
--- lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -260,7 +260,10 @@
 
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec:{
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max: {
     const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
     if (!IsVolatile || !IsVolatile->isZero())
       return false;
@@ -289,6 +292,9 @@
   case Intrinsic::objectsize:
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_add:
+  case Intrinsic::amdgcn_atomic_min:
+  case Intrinsic::amdgcn_atomic_max:
     appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
                                                  PostorderStack, Visited);
     break;
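
Note: with the InferAddressSpaces changes, a flat pointer that provably addresses
LDS can be rewritten to addrspace(3), but only when the isVolatile operand is a
constant false. A sketch, assuming a flat-pointer overload whose name mangles as
".f32.p4f32" (the exact suffix is an assumption, with flat as addrspace(4) here):

  declare float @llvm.amdgcn.atomic.add.f32.p4f32(float addrspace(4)* nocapture, float, i32, i32, i1)

  define amdgpu_kernel void @promote(float addrspace(3)* %lds, float %v) {
    %flat = addrspacecast float addrspace(3)* %lds to float addrspace(4)*
    ; isVolatile is a constant false, so the pass may rewrite %flat back to
    ; the original addrspace(3) pointer, enabling DS instruction selection.
    %old = call float @llvm.amdgcn.atomic.add.f32.p4f32(float addrspace(4)* %flat,
                                                        float %v, i32 0, i32 0, i1 false)
    ret void
  }
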
Index: test/CodeGen/AMDGPU/lds_atomic_f32.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/lds_atomic_f32.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
+
+declare float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.atomic.min.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.atomic.max.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+
+; CHECK-LABEL: {{^}}lds_atomic_add_f32:
+; CHECK: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; CHECK: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]]
+; CHECK: ds_add_f32 [[V1]], [[V0]]
+; CHECK: s_waitcnt lgkmcnt(1)
+; CHECK: ds_add_rtn_f32 v2, [[V1]], [[V2]]
+define amdgpu_kernel void @lds_atomic_add_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf) {
+  %a1 = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}lds_atomic_min_f32:
+; CHECK: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; CHECK: ds_min_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]]
+; CHECK: ds_min_f32 [[V1]], [[V0]]
+; CHECK: s_waitcnt lgkmcnt(1)
+; CHECK: ds_min_rtn_f32 v2, [[V1]], [[V2]]
+define amdgpu_kernel void @lds_atomic_min_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf) {
+  %a1 = call float @llvm.amdgcn.atomic.min.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.min.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.min.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: {{^}}lds_atomic_max_f32:
+; CHECK: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; CHECK: ds_max_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]]
+; CHECK: ds_max_f32 [[V1]], [[V0]]
+; CHECK: s_waitcnt lgkmcnt(1)
+; CHECK: ds_max_rtn_f32 v2, [[V1]], [[V2]]
+define amdgpu_kernel void @lds_atomic_max_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf) {
+  %a1 = call float @llvm.amdgcn.atomic.max.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.max.f32(float addrspace(3)* %ptrf, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.max.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
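
Note: possible additional coverage, sketched under the same assumptions as the
test above: a call with a non-zero isVolatile operand. Per the InferAddressSpaces
change, such a call is never rewritten to another address space, and the access
must not be reordered or deleted:

  define amdgpu_kernel void @lds_atomic_add_f32_volatile(float addrspace(3)* %ptrf, float %v) {
    ; i1 true marks the operation volatile.
    %a = call float @llvm.amdgcn.atomic.add.f32(float addrspace(3)* %ptrf,
                                                float %v, i32 0, i32 0, i1 true)
    ret void
  }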