Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  ATOMIC_CMP_SWAP,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2812,6 +2812,7 @@
   NODE_NAME_CASE(INTERP_P2)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -183,6 +183,11 @@
   SDTypeProfile<0, 2, []>,
   [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+  SDTypeProfile<1, 2, [SDTCisVec<0>,
+                       SDTCisPtrTy<1>, SDTCisSameAs<0, 2>]>, [SDNPHasChain,
+  SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+
 def AMDGPUround : SDNode<"ISD::FROUND",
   SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -400,6 +400,8 @@
 def atomic_umin_global : global_binary_atomic_op<atomic_umin>;
 def atomic_xor_global : global_binary_atomic_op<atomic_xor>;
 
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/CIInstructions.td
===================================================================
--- lib/Target/AMDGPU/CIInstructions.td
+++ lib/Target/AMDGPU/CIInstructions.td
@@ -156,7 +156,7 @@
   flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32
 >;
 defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC <
-  flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64
+  flat<0x31, 0x41>, "flat_atomic_cmpswap", VReg_64
 >;
 defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <
   flat<0x32, 0x42>, "flat_atomic_add", VGPR_32
@@ -195,7 +195,7 @@
   flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64
 >;
 defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC <
-  flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128
+  flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_128
 >;
 defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <
   flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64
 >;
@@ -322,6 +322,9 @@
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
 def : FlatAtomicPat ;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
+
 } // End Predicates = [isCIVI]
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -41,6 +41,7 @@
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -257,6 +257,16 @@
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
+  // and output demarshalling
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+  // We can't return success/failure, only the old value;
+  // let LLVM add the comparison
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -1156,6 +1166,7 @@
     return LowerTrig(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::GlobalAddress: {
     MachineFunction &MF = DAG.getMachineFunction();
@@ -2003,6 +2014,45 @@
   }
 }
 
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+  assert(AtomicNode && AtomicNode->isCompareAndSwap());
+
+  MemSDNode *MemNode = cast<MemSDNode>(Op);
+  unsigned AS = MemNode->getAddressSpace();
+
+  // No custom lowering required for local address space
+  if (!isFlatGlobalAddrSpace(AS))
+    return Op;
+
+  // Non-local address space requires custom lowering for atomic compare
+  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
+  SDLoc DL(Op);
+  SDValue ChainIn = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  SDValue Old = Op.getOperand(2);
+  SDValue New = Op.getOperand(3);
+  EVT VT = Op.getValueType();
+  MVT SVT = VT.getSimpleVT();
+
+  SDValue NewOld = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::getVectorVT(SVT, 2),
+                               New, Old);
+  SDValue Ops[] = { ChainIn, Addr, NewOld };
+  MachineMemOperand *MMO = MemNode->getMemOperand();
+
+  SDValue ResVal = DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
+                                           DAG.getVTList(NewOld.getValueType(),
+                                                         MVT::Other),
+                                           Ops, VT, MMO);
+
+  // Extract returned previous value
+  SDValue Zero = DAG.getConstant(0, DL, SVT);
+  SDValue Previous = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT,
+                                 ResVal.getValue(0), Zero);
+
+  // Merge return value and Chain
+  return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
+                     Previous, ResVal.getValue(1));
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
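A note on the ATOMIC_CMP_SWAP_WITH_SUCCESS Expand above: when the i1 success result of a cmpxchg is actually used, the DAG legalizer rewrites the node into a plain ATOMIC_CMP_SWAP plus a compare of the returned old value against the expected value, so the custom lowering above only ever has to hand back the old value. The IR below is an illustrative sketch of that situation, not one of the tests added by this patch; the function and value names are made up.

define i32 @cmpxchg_success_used(i32 addrspace(1)* %ptr, i32 %old, i32 %new) {
entry:
  %pair = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %new seq_cst seq_cst
  %prev = extractvalue { i32, i1 } %pair, 0
  %ok = extractvalue { i32, i1 } %pair, 1
  %res = select i1 %ok, i32 %prev, i32 %old
  ret i32 %res
}
; After the WITH_SUCCESS expansion only %prev comes back from the
; buffer/flat_atomic_cmpswap (with glc); %ok is recreated as an equality
; compare of %prev against %old, so no extra hardware support is needed.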
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1012,7 +1012,7 @@
   mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
 >;
 defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic <
-  mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
+  mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", VReg_64, v2i32, atomic_cmp_swap_global
 >;
 defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
   mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
@@ -1048,7 +1048,9 @@
 //def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI
 //def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+  mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, atomic_cmp_swap_global
+>;
 //def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>;
 //def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>;
 //def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
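The v2i32 and v2i64 data types above mirror how LowerATOMIC_CMP_SWAP packs its operands: the swap value goes in element 0 and the compare value in element 1 of a single VReg_64 (or VReg_128 for the _X2 form) source, and with glc set the old memory value comes back in element 0, which is what the EXTRACT_VECTOR_ELT in the lowering reads. An illustrative 64-bit case in the same style as the tests that follow (a sketch only, not one of the tests added by this patch):

define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
entry:
  %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
  %prev = extractvalue { i64, i1 } %val, 0
  store i64 %prev, i64 addrspace(1)* %out2
  ret void
}
; Expected to select to buffer_atomic_cmpswap_x2 ... glc (flat_atomic_cmpswap_x2
; on VI), with the previous value returned in the low 64 bits of the data quad.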
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -758,6 +758,95 @@
   ret void
 }
 
+; CMP_SWAP
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {