Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -319,6 +319,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  ATOMIC_CMP_SWAP,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2830,6 +2830,7 @@
   NODE_NAME_CASE(INTERP_P2)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -183,6 +183,16 @@
                       SDTypeProfile<0, 2, []>,
                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
+def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+  SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+  SDTCisVT<2, v2i32>]>, [SDNPHasChain, SDNPMayStore,
+  SDNPMayLoad, SDNPMemOperand]>;
+
+def AMDGPUatomic_cmp_swap_x2 : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
+  SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+  SDTCisVT<2, v2i64>]>, [SDNPHasChain, SDNPMayStore,
+  SDNPMayLoad, SDNPMemOperand]>;
+
 def AMDGPUround : SDNode<"ISD::FROUND",
                          SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
Index: lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructions.td
+++ lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -381,6 +381,17 @@
 def atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
 def atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
 
+def atomic_cmp_swap_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_x2_global : global_binary_atomic_op<AMDGPUatomic_cmp_swap_x2>;
+
+class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
+  (ops node:$ptr, node:$value),
+  (atomic_op node:$ptr, node:$value),
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+>;
+
+def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
+def atomic_cmp_swap_x2_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap_x2>;
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/CIInstructions.td
===================================================================
--- lib/Target/AMDGPU/CIInstructions.td
+++ lib/Target/AMDGPU/CIInstructions.td
@@ -306,8 +306,9 @@
 def : FlatStorePat ;
 def : FlatStorePat ;
 
-class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat <
-  (vt (node i64:$addr, vt:$data)),
+class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType return_type,
+                     ValueType data_type = return_type> : Pat <
+  (return_type (node i64:$addr, data_type:$data)),
   (inst $addr, $data, 0, 0)
 >;
@@ -320,6 +321,11 @@
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
 def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
 def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+def : FlatAtomicPat ;
+
 } // End Predicates = [isCIVI]
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -41,6 +41,7 @@
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -254,6 +254,9 @@
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -1141,6 +1144,7 @@
     return LowerTrig(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::FDIV: return LowerFDIV(Op, DAG);
+  case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::GlobalAddress: {
     MachineFunction &MF = DAG.getMachineFunction();
@@ -1978,6 +1982,40 @@
   }
 }
 
+SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+  MemSDNode *MemNode = cast<MemSDNode>(Op);
+  unsigned AS = MemNode->getAddressSpace();
+
+  // No custom lowering is required for the local address space.
+  if (!isFlatGlobalAddrSpace(AS))
+    return Op;
+
+  // Non-local address spaces need custom lowering for atomic compare and
+  // swap; the swap and compare values are packed into a v2i32/v2i64 vector.
+  SDLoc DL(Op);
+  SDValue ChainIn = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  SDValue CmpVal = Op.getOperand(2);
+  SDValue SwapVal = Op.getOperand(3);
+  EVT VT = Op.getValueType();
+
+  SDValue Res;
+  SDVTList VTList;
+  if (VT.getSizeInBits() == 32) {
+    Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, SwapVal, CmpVal);
+    VTList = DAG.getVTList(MVT::i32, MVT::Other);
+  } else if (VT.getSizeInBits() == 64) {
+    Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, SwapVal, CmpVal);
+    VTList = DAG.getVTList(MVT::i64, MVT::Other);
+  } else {
+    llvm_unreachable("Wrong type");
+  }
+  SDValue Ops[] = { ChainIn, Addr, Res };
+  MachineMemOperand *MMO = MemNode->getMemOperand();
+
+  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, VTList, Ops, VT, MMO);
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -2499,8 +2499,9 @@
 // for VI appropriately.
 }
 
-multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc,
-                         ValueType vt, SDPatternOperator atomic> {
+multiclass MUBUF_Atomic <mubuf op, string name, SDPatternOperator atomic,
+                         RegisterClass outputRC, ValueType outputVT,
+                         RegisterClass inputRC = outputRC, ValueType inputVT = outputVT> {
 
   let mayStore = 1, mayLoad = 1, hasPostISelHook = 1 in {
@@ -2509,41 +2510,41 @@
     defm _ADDR64 : MUBUFAtomicAddr64_m <
       op, name#"_addr64", (outs),
-      (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
+      (ins inputRC:$vdata, VReg_64:$vaddr, SReg_128:$srsrc,
            SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
       name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0
     >;
 
     defm _OFFSET : MUBUFAtomicOffset_m <
       op, name#"_offset", (outs),
-      (ins rc:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
+      (ins inputRC:$vdata, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset,
            slc:$slc),
       name#" $vdata, $srsrc, $soffset"#"$offset"#"$slc", [], 0
     >;
   } // glc = 0
 
   // Variant that return values
-  let glc = 1, Constraints = "$vdata = $vdata_in",
+  let glc = 1,
       DisableEncoding = "$vdata_in" in {
 
     defm _RTN_ADDR64 : MUBUFAtomicAddr64_m <
-      op, name#"_rtn_addr64", (outs rc:$vdata),
-      (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
+      op, name#"_rtn_addr64", (outs outputRC:$vdata),
+      (ins inputRC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
            SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc),
       name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc",
-      [(set vt:$vdata,
+      [(set outputVT:$vdata,
        (atomic (MUBUFAddr64Atomic v4i32:$srsrc, i64:$vaddr, i32:$soffset,
-                                  i16:$offset, i1:$slc), vt:$vdata_in))], 1
+                                  i16:$offset, i1:$slc), inputVT:$vdata_in))], 1
     >;
 
     defm _RTN_OFFSET : MUBUFAtomicOffset_m <
-      op, name#"_rtn_offset", (outs rc:$vdata),
-      (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
+      op, name#"_rtn_offset", (outs outputRC:$vdata),
+      (ins inputRC:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset,
            mbuf_offset:$offset, slc:$slc),
       name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc",
-      [(set vt:$vdata,
+      [(set outputVT:$vdata,
        (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset,
-                                  i1:$slc), vt:$vdata_in))], 1
+                                  i1:$slc), inputVT:$vdata_in))], 1
     >;
   } // glc = 1
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -980,36 +980,38 @@
 >;
 
 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
-  mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+  mubuf<0x30, 0x40>, "buffer_atomic_swap", atomic_swap_global, VGPR_32, i32
+>;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Atomic <
+  mubuf<0x31, 0x41>, "buffer_atomic_cmpswap", atomic_cmp_swap_global, VGPR_32, i32, VReg_64, v2i32
 >;
-//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ , "buffer_atomic_cmpswap", []>;
 defm BUFFER_ATOMIC_ADD : MUBUF_Atomic <
-  mubuf<0x32, 0x42>, "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+  mubuf<0x32, 0x42>, "buffer_atomic_add", atomic_add_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_SUB : MUBUF_Atomic <
-  mubuf<0x33, 0x43>, "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+  mubuf<0x33, 0x43>, "buffer_atomic_sub", atomic_sub_global, VGPR_32, i32
 >;
 //def BUFFER_ATOMIC_RSUB : MUBUF_ , "buffer_atomic_rsub", []>; // isn't on CI & VI
 defm BUFFER_ATOMIC_SMIN : MUBUF_Atomic <
-  mubuf<0x35, 0x44>, "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+  mubuf<0x35, 0x44>, "buffer_atomic_smin", atomic_min_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_UMIN : MUBUF_Atomic <
-  mubuf<0x36, 0x45>, "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+  mubuf<0x36, 0x45>, "buffer_atomic_umin", atomic_umin_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_SMAX : MUBUF_Atomic <
-  mubuf<0x37, 0x46>, "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+  mubuf<0x37, 0x46>, "buffer_atomic_smax", atomic_max_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_UMAX : MUBUF_Atomic <
-  mubuf<0x38, 0x47>, "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+  mubuf<0x38, 0x47>, "buffer_atomic_umax", atomic_umax_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_AND : MUBUF_Atomic <
-  mubuf<0x39, 0x48>, "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+  mubuf<0x39, 0x48>, "buffer_atomic_and", atomic_and_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_OR : MUBUF_Atomic <
-  mubuf<0x3a, 0x49>, "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+  mubuf<0x3a, 0x49>, "buffer_atomic_or", atomic_or_global, VGPR_32, i32
 >;
 defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
-  mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+  mubuf<0x3b, 0x4a>, "buffer_atomic_xor", atomic_xor_global, VGPR_32, i32
 >;
 //def BUFFER_ATOMIC_INC : MUBUF_ , "buffer_atomic_inc", []>;
 //def BUFFER_ATOMIC_DEC : MUBUF_ , "buffer_atomic_dec", []>;
@@ -1017,7 +1019,9 @@
 //def BUFFER_ATOMIC_FMIN : MUBUF_ , "buffer_atomic_fmin", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX : MUBUF_ , "buffer_atomic_fmax", []>; // isn't on VI
 //def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 , "buffer_atomic_swap_x2", []>;
-//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 , "buffer_atomic_cmpswap_x2", []>;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Atomic <
+  mubuf<0x51, 0x61>, "buffer_atomic_cmpswap_x2", atomic_cmp_swap_x2_global, VReg_64, i64, VReg_128, v2i64
+>;
 //def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 , "buffer_atomic_add_x2", []>;
 //def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 , "buffer_atomic_sub_x2", []>;
 //def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 , "buffer_atomic_rsub_x2", []>; // isn't on CI & VI
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -52,6 +52,34 @@
      llvm_i32_ty],     // tfe(imm)
     [IntrReadArgMem]>;
 
+  // Fully-flexible BUFFER_ATOMIC_* except for the ADDR64 bit, which is not exposed
+  class MubufAtomicRaw : Intrinsic <
+    [llvm_anyint_ty],  // vdata(VGPR), overloaded for types i32, i64
+    [llvm_anyint_ty,   // rsrc(SGPR)
+     llvm_anyint_ty,   // vaddr(VGPR)
+     llvm_i32_ty,      // soffset(SGPR)
+     llvm_i32_ty,      // inst_offset(imm)
+     llvm_i32_ty,      // offen(imm)
+     llvm_i32_ty,      // idxen(imm)
+     llvm_i32_ty,      // glc(imm)
+     llvm_i32_ty,      // slc(imm)
+     llvm_i32_ty],     // tfe(imm)
+    [IntrNoMem]>;
+
+  def int_SI_buffer_atomic_swap : MubufAtomicRaw;
+  def int_SI_buffer_atomic_cmpswap : MubufAtomicRaw; // not sure
+  def int_SI_buffer_atomic_add : MubufAtomicRaw;
+  def int_SI_buffer_atomic_sub : MubufAtomicRaw;
+  def int_SI_buffer_atomic_smin : MubufAtomicRaw;
+  def int_SI_buffer_atomic_umin : MubufAtomicRaw;
+  def int_SI_buffer_atomic_smax : MubufAtomicRaw;
+  def int_SI_buffer_atomic_umax : MubufAtomicRaw;
+  def int_SI_buffer_atomic_and : MubufAtomicRaw;
+  def int_SI_buffer_atomic_or : MubufAtomicRaw;
+  def int_SI_buffer_atomic_xor : MubufAtomicRaw;
+  def int_SI_buffer_atomic_inc : MubufAtomicRaw;
+  def int_SI_buffer_atomic_dec : MubufAtomicRaw;
+
   def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
 
   // Fully-flexible SAMPLE instruction.
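
Illustrative example (not part of the patch): the custom lowering added in SIISelLowering.cpp is reached from an ordinary IR-level cmpxchg on a global or flat pointer. A minimal sketch with hypothetical function and value names, assuming a 32-bit operation in the global address space:

; Hypothetical input, for illustration only. The i32 cmpxchg below is handled by
; SITargetLowering::LowerATOMIC_CMP_SWAP: the swap and compare values are packed
; into a v2i32 operand of an AMDGPUISD::ATOMIC_CMP_SWAP node, which is then
; selected to buffer_atomic_cmpswap (or flat_atomic_cmpswap on CI/VI).
define i32 @cmpswap_global_i32(i32 addrspace(1)* %ptr, i32 %cmp, i32 %swap) {
  %pair = cmpxchg i32 addrspace(1)* %ptr, i32 %cmp, i32 %swap seq_cst seq_cst
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}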