Index: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -295,6 +295,21 @@
 def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
 def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
 
+class AMDGPUAtomicF32Intrin<string clang_builtin> :
+  GCCBuiltin<clang_builtin>,
+  Intrinsic<[llvm_float_ty],
+    [LLVMAnyPointerType<llvm_float_ty>,
+    llvm_float_ty,
+    llvm_i32_ty, // ordering
+    llvm_i32_ty, // scope
+    llvm_i1_ty], // isVolatile
+    [IntrArgMemOnly, NoCapture<0>]
+>;
+
+def int_amdgcn_atomic_fadd : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fadd">;
+def int_amdgcn_atomic_fmin : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmin">;
+def int_amdgcn_atomic_fmax : AMDGPUAtomicF32Intrin<"__builtin_amdgcn_ds_fmax">;
+
 class AMDGPUImageLoad : Intrinsic <
   [llvm_anyfloat_ty], // vdata(VGPR)
   [llvm_anyint_ty,    // vaddr(VGPR)
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -450,7 +450,10 @@
   }
 
   if (isa<AtomicSDNode>(N) ||
-      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
+      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FADD ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
     N = glueCopyToM0(N);
 
   switch (Opc) {
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -457,6 +457,9 @@
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
+  ATOMIC_LOAD_FADD,
+  ATOMIC_LOAD_FMIN,
+  ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
   BUFFER_LOAD_FORMAT_D16,
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3982,6 +3982,9 @@
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
+  NODE_NAME_CASE(ATOMIC_LOAD_FADD)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
+  NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -475,6 +475,9 @@
   case Intrinsic::r600_read_tidig_z:
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
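Aside, illustrative only and not part of the diff: at the IR level, all three new intrinsics share the signature defined by AMDGPUAtomicF32Intrin above. A minimal usage sketch follows; the function name @fadd_sketch is hypothetical, the operand roles follow the TableGen comments, and the zero ordering/scope values mirror what the new test at the end of this patch passes.

declare float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)

define float @fadd_sketch(float addrspace(3)* %lds, float %v) {
  ; Operand order: LDS pointer, value, ordering (i32), scope (i32), isVolatile (i1).
  ; The result is the prior contents of the location; a consumed result is
  ; selected to the returning ds_add_rtn_f32 form (see the test below).
  %old = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %lds, float %v, i32 0, i32 0, i1 false)
  ret float %old
}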
 defm DS_MSKOR_RTN_B32 : DS_1A2D_RET_mc<"ds_mskor_rtn_b32", VGPR_32, "ds_mskor_b32">;
 defm DS_CMPST_RTN_B32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_b32", VGPR_32, "ds_cmpst_b32">;
 defm DS_CMPST_RTN_F32 : DS_1A2D_RET_mc<"ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">;
-defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc <"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
+defm DS_MIN_RTN_F32 : DS_1A1D_RET_mc<"ds_min_rtn_f32", VGPR_32, "ds_min_f32">;
 defm DS_MAX_RTN_F32 : DS_1A1D_RET_mc<"ds_max_rtn_f32", VGPR_32, "ds_max_f32">;
 
 defm DS_WRXCHG_RTN_B32 : DS_1A1D_RET_mc<"ds_wrxchg_rtn_b32">;
@@ -769,6 +769,9 @@
 defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, atomic_load_umin_local>;
 defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, atomic_load_umax_local>;
 defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, atomic_cmp_swap_32_local>;
+defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, atomic_load_fadd_local>;
+defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, atomic_load_fmin_local>;
+defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, atomic_load_fmax_local>;
 
 // 64-bit atomics.
 defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, atomic_swap_local>;
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -565,7 +565,10 @@
                                           unsigned IntrID) const {
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
     Info.ptrVal = CI.getOperand(0);
@@ -803,7 +806,10 @@
                                             Type *&AccessTy) const {
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax: {
     Value *Ptr = II->getArgOperand(0);
     AccessTy = II->getType();
     Ops.push_back(Ptr);
@@ -4548,10 +4554,31 @@
 
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec: {
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax: {
     MemSDNode *M = cast<MemSDNode>(Op);
-    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
-      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+    unsigned Opc;
+    switch (IntrID) {
+    case Intrinsic::amdgcn_atomic_inc:
+      Opc = AMDGPUISD::ATOMIC_INC;
+      break;
+    case Intrinsic::amdgcn_atomic_dec:
+      Opc = AMDGPUISD::ATOMIC_DEC;
+      break;
+    case Intrinsic::amdgcn_atomic_fadd:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FADD;
+      break;
+    case Intrinsic::amdgcn_atomic_fmin:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
+      break;
+    case Intrinsic::amdgcn_atomic_fmax:
+      Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
+      break;
+    default:
+      llvm_unreachable("Unknown intrinsic!");
+    }
     SDValue Ops[] = {
       M->getOperand(0), // Chain
       M->getOperand(2), // Ptr
@@ -6817,7 +6844,10 @@
   case ISD::ATOMIC_LOAD_UMIN:
   case ISD::ATOMIC_LOAD_UMAX:
   case AMDGPUISD::ATOMIC_INC:
-  case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
+  case AMDGPUISD::ATOMIC_DEC:
+  case AMDGPUISD::ATOMIC_LOAD_FADD:
+  case AMDGPUISD::ATOMIC_LOAD_FMIN:
+  case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
     if (DCI.isBeforeLegalize())
       break;
     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
Index: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
@@ -46,6 +46,22 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
+def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
+  SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
+]>;
+
+def SIatomic_fadd : SDNode<"AMDGPUISD::ATOMIC_LOAD_FADD", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmin : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMIN", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_fmax : SDNode<"AMDGPUISD::ATOMIC_LOAD_FMAX", SDTAtomic2_f32,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SDTbuffer_load : SDTypeProfile<1, 9,
     [                   // vdata
      SDTCisVT<1, v4i32>, // rsrc
@@ -207,6 +223,9 @@
 def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
 def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
+def atomic_load_fadd_local : local_binary_atomic_op<SIatomic_fadd>;
+def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
+def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
 
 //===----------------------------------------------------------------------===//
 // SDNodes PatFrags for loads/stores with a glue input.
 //===----------------------------------------------------------------------===//
@@ -341,10 +360,11 @@
   (shl $src0, $src1)
 >;
 
-multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
+                            SDTypeProfile tc = SDTAtomic2> {
 
   def _glue : SDNode <
-    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
+    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
     [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
   >;
 
@@ -363,6 +383,9 @@
 defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
 defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
 defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 1, SDTAtomic2_f32>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
 
 def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
   [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
Index: llvm/trunk/lib/Transforms/Scalar/InferAddressSpaces.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ llvm/trunk/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -260,7 +260,10 @@
 
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec:{
+  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax: {
     const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
     if (!IsVolatile || !IsVolatile->isZero())
       return false;
@@ -289,6 +292,9 @@
   case Intrinsic::objectsize:
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_fadd:
+  case Intrinsic::amdgcn_atomic_fmin:
+  case Intrinsic::amdgcn_atomic_fmax:
     appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
                                                  PostorderStack, Visited);
     break;
Index: llvm/trunk/test/CodeGen/AMDGPU/lds_atomic_f32.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds_atomic_f32.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lds_atomic_f32.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+declare float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+declare float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
+
+; GCN-LABEL: {{^}}lds_atomic_fadd_f32:
+; VI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; GCN: s_waitcnt lgkmcnt(1)
+; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+define amdgpu_kernel void @lds_atomic_fadd_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
+  %a1 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_fmin_f32:
+; VI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; GCN: ds_min_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; GCN: ds_min_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; GCN: s_waitcnt lgkmcnt(1)
+; GCN: ds_min_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+define amdgpu_kernel void @lds_atomic_fmin_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
+  %a1 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_fmax_f32:
+; VI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; GCN: ds_max_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; GCN: ds_max_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; GCN: s_waitcnt lgkmcnt(1)
+; GCN: ds_max_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+define amdgpu_kernel void @lds_atomic_fmax_f32(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+  %idx.add = add nuw i32 %idx, 4
+  %shl0 = shl i32 %idx.add, 3
+  %shl1 = shl i32 %idx.add, 4
+  %ptr0 = inttoptr i32 %shl0 to float addrspace(3)*
+  %ptr1 = inttoptr i32 %shl1 to float addrspace(3)*
+  %a1 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a2 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false)
+  %a3 = call float @llvm.amdgcn.atomic.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false)
+  store float %a3, float addrspace(1)* %out
+  ret void
+}
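Aside, illustrative only and not part of the diff: the CHECK lines above also pin down the returning versus non-returning DS forms. A call whose result is consumed (%a1, %a3) is selected to ds_add_rtn_f32, while the call whose result is dead (%a2) is selected to plain ds_add_f32. A reduced sketch of that distinction, reusing the fadd declaration from the test; the kernel name is hypothetical.

define amdgpu_kernel void @fadd_used_vs_dead(float addrspace(1)* %out, float addrspace(3)* %ptr) {
  ; Result consumed: expected to select the returning form, ds_add_rtn_f32.
  %used = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr, float 1.0, i32 0, i32 0, i1 false)
  ; Result unused: expected to select the non-returning form, ds_add_f32.
  %dead = call float @llvm.amdgcn.atomic.fadd.f32(float addrspace(3)* %ptr, float 2.0, i32 0, i32 0, i1 false)
  store float %used, float addrspace(1)* %out
  ret void
}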