Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -716,8 +716,7 @@
       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
        Opc == ISD::ATOMIC_LOAD_FADD ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
-       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
-       Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
+       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
     N = glueCopyToM0LDSInit(N);
     SelectCode(N);
     return;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -508,7 +508,6 @@
   ATOMIC_DEC,
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
-  ATOMIC_LOAD_CSUB,
   BUFFER_LOAD,
   BUFFER_LOAD_UBYTE,
   BUFFER_LOAD_USHORT,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4340,7 +4340,6 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
-  NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4200,6 +4200,7 @@
     case Intrinsic::amdgcn_ds_fadd:
     case Intrinsic::amdgcn_ds_fmin:
     case Intrinsic::amdgcn_ds_fmax:
+    case Intrinsic::amdgcn_global_atomic_csub:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
     case Intrinsic::amdgcn_ds_ordered_swap: {
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1006,7 +1006,7 @@
 
 let SubtargetPredicate = HasGFX10_BEncoding in
 defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
-  "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32
+  "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
 >;
 
 let SubtargetPredicate = isGFX8GFX9 in {
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -662,7 +662,7 @@
 
 let SubtargetPredicate = HasGFX10_BEncoding in
 defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
-  VGPR_32, i32, atomic_csub_global_32>;
+  VGPR_32, i32, int_amdgcn_global_atomic_csub>;
 
 } // End is_flat_global = 1
 
@@ -959,7 +959,7 @@
 def : FlatSignedAtomicPat ;
 def : FlatSignedAtomicPat ;
 def : FlatSignedAtomicPat ;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, int_amdgcn_global_atomic_csub, i32>;
 def : FlatSignedAtomicPat ;
 def : FlatSignedAtomicPat ;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7197,19 +7197,6 @@
     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
-  case Intrinsic::amdgcn_global_atomic_csub: {
-    MemSDNode *M = cast<MemSDNode>(Op);
-    SDValue Ops[] = {
-      M->getOperand(0), // Chain
-      M->getOperand(2), // Ptr
-      M->getOperand(3) // Value
-    };
-
-    return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
-                                   M->getVTList(), Ops, M->getMemoryVT(),
-                                   M->getMemOperand());
-  }
-
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -55,10 +55,6 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
-def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
-  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
 def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
   SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
 ]>;
@@ -311,10 +307,6 @@
 // PatFrags for global memory operations
 //===----------------------------------------------------------------------===//
 
-let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
-defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
-}
-
 foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
 let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
 
@@ -668,7 +660,6 @@
 
 defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
 defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
-defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>;
 defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
 defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
 defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+
+define i32 @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) {
+; GCN-LABEL: global_atomic_csub:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %ptr, i32 %data)
+  ret i32 %ret
+}
+
+define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) {
+; GCN-LABEL: global_atomic_csub_offset:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    s_movk_i32 s4, 0x1000
+; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    v_mov_b32_e32 v3, s4
+; GCN-NEXT:    v_mov_b32_e32 v4, s5
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v0, v3
+; GCN-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
+; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024
+  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data)
+  ret i32 %ret
+}
+
+define void @global_atomic_csub_nortn(i32 addrspace(1)* %ptr, i32 %data) {
+; GCN-LABEL: global_atomic_csub_nortn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
+; GCN-NEXT:    ; implicit-def: $vcc_hi
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %ptr, i32 %data)
  ret void
}

define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data) {
; GCN-LABEL: global_atomic_csub_offset_nortn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_waitcnt_vscnt null, 0x0
; GCN-NEXT:    s_movk_i32 s4, 0x1000
; GCN-NEXT:    s_mov_b32 s5, 0
; GCN-NEXT:    v_mov_b32_e32 v3, s4
; GCN-NEXT:    v_mov_b32_e32 v4, s5
; GCN-NEXT:    ; implicit-def: $vcc_hi
; GCN-NEXT:    v_add_co_u32_e64 v0, vcc_lo, v0, v3
; GCN-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024
  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data)
  ret void
}

define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(i32 addrspace(1)* %ptr, i32 %data) {
; GCN-LABEL: global_atomic_csub_sgpr_base_offset:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_clause 0x1
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GCN-NEXT:    s_load_dword s2, s[4:5], 0x8
; GCN-NEXT:    ; implicit-def: $vcc_hi
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    global_store_dword v[0:1], v0, off
; GCN-NEXT:    s_endpgm
  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024
  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data)
  store i32 %ret, i32 addrspace(1)* undef
  ret void
}

define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(i32 addrspace(1)* %ptr, i32 %data) {
; GCN-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_clause 0x1
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; GCN-NEXT:    s_load_dword s2, s[4:5], 0x8
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_add_u32 s0, s0, 0x1000
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
; GCN-NEXT:    s_endpgm
  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 1024
  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %gep, i32 %data)
  ret void
}

declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #1

attributes #0 = { nounwind willreturn }
attributes #1 = { argmemonly nounwind }