Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  ATOMIC_CMP_SWAP,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2812,6 +2812,7 @@
   NODE_NAME_CASE(INTERP_P2)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -183,6 +183,11 @@
   SDTypeProfile<0, 2, []>,
   [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+  SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
+  [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+   SDNPMemOperand]>;
+
 def AMDGPUround : SDNode<"ISD::FROUND",
   SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -400,6 +400,13 @@
 def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
 def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
 
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_global_nortn : PatFrag<
+  (ops node:$ptr, node:$value),
+  (atomic_cmp_swap_global node:$ptr, node:$value),
+  [{ return SDValue(N, 0).use_empty(); }]
+>;
+
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/CIInstructions.td
===================================================================
--- lib/Target/AMDGPU/CIInstructions.td
+++ lib/Target/AMDGPU/CIInstructions.td
@@ -308,8 +308,9 @@
 def : FlatStorePat ;
 def : FlatStorePat ;
 
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt,
+                     ValueType data_vt = vt> : Pat <
+  (vt (node i64:$addr, data_vt:$data)),
   (inst $addr, $data, 0, 0)
 >;
 
@@ -322,6 +323,9 @@
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, atomic_cmp_swap_global, i32, v2i32>;
 def : FlatAtomicPat ;
+def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, atomic_cmp_swap_global, i64, v2i64>;
+
 } // End Predicates = [isCIVI]
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -41,6 +41,7 @@
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -257,6 +257,16 @@
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
+  // output demarshalling.
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+  // We can't return success/failure, only the old value; let LLVM add the
+  // comparison.
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -1156,6 +1166,7 @@
     return LowerTrig(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::GlobalAddress: {
     MachineFunction &MF = DAG.getMachineFunction();
@@ -2003,6 +2014,34 @@
   }
 }
 
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
+  assert(AtomicNode && AtomicNode->isCompareAndSwap());
+  unsigned AS = AtomicNode->getAddressSpace();
+
+  // No custom lowering required for the local address space.
+  if (!isFlatGlobalAddrSpace(AS))
+    return Op;
+
+  // Non-local address spaces require custom lowering for atomic compare
+  // and swap; the cmp and swap values are packed into a v2i32 (or a v2i64
+  // for the _X2 variants).
+  SDLoc DL(Op);
+  SDValue ChainIn = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  SDValue Old = Op.getOperand(2);
+  SDValue New = Op.getOperand(3);
+  EVT VT = Op.getValueType();
+  MVT SimpleVT = VT.getSimpleVT();
+  MVT VecType = MVT::getVectorVT(SimpleVT, 2);
+
+  SDValue NewOld = DAG.getNode(ISD::BUILD_VECTOR, DL, VecType, New, Old);
+
+  SDValue Ops[] = { ChainIn, Addr, NewOld };
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, VTList, Ops,
+                                 VT, AtomicNode->getMemOperand());
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
@@ -2849,8 +2888,31 @@
     if (!Node->hasAnyUseOfValue(0)) {
       MI->setDesc(TII->get(NoRetAtomicOp));
       MI->RemoveOperand(0);
+      return;
     }
 
+    // For mubuf_atomic_cmpswap, we need tablegen to use an extract_subreg
+    // instruction, because the return type of these instructions is a vec2
+    // of the memory type, so that it can be tied to the input operand.
+    // This means these instructions always have a use, so we need to add a
+    // special case to check if the atomic has only one extract_subreg use,
+    // which itself has no uses.
+    if ((Node->hasNUsesOfValue(1, 0) &&
+         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
+         !Node->use_begin()->hasAnyUseOfValue(0))) {
+      unsigned Def = MI->getOperand(0).getReg();
+
+      // Change this into a noret atomic.
+      MI->setDesc(TII->get(NoRetAtomicOp));
+      MI->RemoveOperand(0);
+
+      // If we only remove the def operand from the atomic instruction, the
+      // extract_subreg will be left with a use of a vreg without a def.
+      // So we need to insert an implicit_def to avoid machine verifier
+      // errors.
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+              TII->get(AMDGPU::IMPLICIT_DEF), Def);
+    }
     return;
   }
 }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1048,7 +1048,9 @@
 //def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI
 //def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", []>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+  mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
+>;
 //def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>;
 //def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>;
 //def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
@@ -3186,6 +3188,37 @@
 def : MUBUFScratchStorePat ;
 def : MUBUFScratchStorePat ;
 
+multiclass MUBUFCmpSwapPat <Instruction inst_addr64, Instruction inst_offset,
+                            SDPatternOperator node, ValueType node_vt,
+                            ValueType data_vt> {
+
+  let Predicates = [isSI] in {
+    def : Pat <
+      (node_vt (node (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
+                                        i16:$offset, i1:$slc), data_vt:$vdata_in)),
+      (EXTRACT_SUBREG
+        (inst_addr64 $vdata_in, $vaddr, $srsrc, $soffset, $offset, $slc), sub0)
+    >;
+  }
+
+  def : Pat <
+    (node_vt (node (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
+                                      i1:$slc), data_vt:$vdata_in)),
+    (EXTRACT_SUBREG
+      (inst_offset $vdata_in, $srsrc, $soffset, $offset, $slc), sub0)
+  >;
+}
+
+defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_RTN_ADDR64, BUFFER_ATOMIC_CMPSWAP_RTN_OFFSET,
+                        atomic_cmp_swap_global, i32, v2i32>;
+
+defm : MUBUFCmpSwapPat <BUFFER_ATOMIC_CMPSWAP_X2_RTN_ADDR64, BUFFER_ATOMIC_CMPSWAP_X2_RTN_OFFSET,
+                        atomic_cmp_swap_global, i64, v2i64>;
+
 //===----------------------------------------------------------------------===//
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -758,6 +758,95 @@
   ret void
 }
 
+; CMP_SWAP
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %0 = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
+; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
+define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
+; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+entry:
+  %0 = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
+; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
+define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
+; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
+; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
+; GCN: buffer_store_dword v[[RET]]
+define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %0 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
+  %1 = extractvalue { i32, i1 } %0, 0
+  store i32 %1, i32 addrspace(1)* %out2
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
 define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
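; Illustrative sketch, not part of the patch: the i64 path added above (the
; BUFFER_ATOMIC_CMPSWAP_X2 / FLAT_ATOMIC_CMPSWAP_X2 patterns and the MVT::i64
; custom lowering) would be exercised by a test of the same shape as the i32
; cases, for example the hypothetical function below, where LowerATOMIC_CMP_SWAP
; packs the new/old values into a v2i64 before instruction selection.
;
; define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
; entry:
;   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
;   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
;   ret void
; }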