Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4245,6 +4245,7 @@
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
+  case Intrinsic::amdgcn_global_atomic_fadd:
   case Intrinsic::amdgcn_global_atomic_csub:
     return getDefaultMappingAllVGPR(MI);
   case Intrinsic::amdgcn_ds_ordered_add:
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1099,7 +1099,7 @@
   "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
 >;
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
-  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+  "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
 >;

 } // End SubtargetPredicate = HasAtomicFaddInsts
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -717,7 +717,7 @@
   "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
 >;
 defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
-  "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
+  "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_fadd_global_noret
 >;

 } // End SubtargetPredicate = HasAtomicFaddInsts
@@ -784,7 +784,7 @@

 class FlatAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data),
-  (inst $vaddr, $data, $offset, $slc)
+  (inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset, $slc)
 >;

 class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
@@ ... @@
 def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32,    atomic_fadd_global_noret, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_fadd_global_noret, v2f16>;

 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1140,6 +1140,8 @@
                                 ->getPointerElementType());
     Info.ptrVal = CI.getOperand(0);
     Info.align.reset();
+
+    // FIXME: Should report an atomic ordering here.
     Info.flags = MachineMemOperand::MOLoad |
                  MachineMemOperand::MOStore;
     return true;
@@ -7520,21 +7522,6 @@
                                    Op->getVTList(), Ops, VT,
                                    M->getMemOperand());
   }
-
-  case Intrinsic::amdgcn_global_atomic_fadd: {
-    SDValue Ops[] = {
-      Chain,
-      Op.getOperand(2), // ptr
-      Op.getOperand(3)  // vdata
-    };
-
-    EVT VT = Op.getOperand(3).getValueType();
-    auto *M = cast<MemSDNode>(Op);
-
-    return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
-                         DAG.getVTList(VT, MVT::Other), Ops,
-                         M->getMemOperand()).getValue(1);
-  }
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
@@ -8566,7 +8553,7 @@

 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
 /// by the chain and intrinsic ID. Theoretically we would also need to check the
-/// specific intrinsic.
+/// specific intrinsic, but they all place the pointer operand first.
 static unsigned getBasePtrIndex(const MemSDNode *N) {
   switch (N->getOpcode()) {
   case ISD::STORE:
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -316,7 +316,7 @@
 } // End let AddressSpaces = ...
 } // End foreach AddrSpace

-def atomic_fadd_global_noret : PatFrag<
+def atomic_fadd_global_noret_impl : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_load_fadd node:$ptr, node:$value)> {
   // FIXME: Move this
@@ -325,14 +325,9 @@
   let AddressSpaces = StoreAddress_global.AddrSpaces;
 }

-def atomic_pk_fadd_global_noret : PatFrag<
-  (ops node:$ptr, node:$value),
-  (atomic_load_fadd node:$ptr, node:$value)> {
-  // FIXME: Move this
-  let MemoryVT = v2f16;
-  let IsAtomic = 1;
-  let AddressSpaces = StoreAddress_global.AddrSpaces;
-}
+def atomic_fadd_global_noret : PatFrags<(ops node:$src0, node:$src1),
+  [(int_amdgcn_global_atomic_fadd node:$src0, node:$src1),
+   (atomic_fadd_global_noret_impl node:$src0, node:$src1)]>;

 //===----------------------------------------------------------------------===//
 // SDNodes PatFrags for loads/stores with a glue input.
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+
+define void @global_atomic_fadd_f32(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %ptr, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_f32_off_2048(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_2048:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_movk_i32 s4, 0x800
+; GFX908-NEXT:    s_mov_b32 s5, 0
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_f32_off_neg2047(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_neg2047:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT:    s_mov_b32 s5, -1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 -511
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %ptr, float %data) {
+; GFX908-LABEL: global_atomic_fadd_f32_off_ss:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX908-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_add_u32 s0, s0, 0x800
+; GFX908-NEXT:    s_addc_u32 s1, s1, 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, s0
+; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s2
+; GFX908-NEXT:    global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT:    s_endpgm
+  %gep = getelementptr float, float addrspace(1)* %ptr, i64 512
+  call void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* %gep, float %data)
+  ret void
+}
+
+define void @global_atomic_fadd_v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %ptr, <2 x half> %data)
+  ret void
+}
+
+define void @global_atomic_fadd_v2f16_off_neg2047(<2 x half> addrspace(1)* %ptr, <2 x half> %data) {
+; GFX908-LABEL: global_atomic_fadd_v2f16_off_neg2047:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    s_mov_b32 s4, 0xfffff804
+; GFX908-NEXT:    s_mov_b32 s5, -1
+; GFX908-NEXT:    v_mov_b32_e32 v3, s4
+; GFX908-NEXT:    v_mov_b32_e32 v4, s5
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX908-NEXT:    global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %ptr, i64 -511
+  call void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* %gep, <2 x half> %data)
+  ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.p1f32.f32(float addrspace(1)* nocapture, float) #0
+declare void @llvm.amdgcn.global.atomic.fadd.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
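
A note on what the merged PatFrags above buys: atomic_fadd_global_noret now matches both the llvm.amdgcn.global.atomic.fadd intrinsic (exercised by the new test) and the generic atomic_load_fadd node via atomic_fadd_global_noret_impl. A minimal IR sketch of that second path follows; the function name is hypothetical, and whether an atomicrmw fadd actually survives to instruction selection (rather than being expanded to a cmpxchg loop by AtomicExpand) depends on the target's expansion rules, so treat this as illustrative rather than a guaranteed codegen test.

define void @global_atomic_fadd_f32_rmw(float addrspace(1)* %ptr, float %data) {
  ; Result is unused, so the no-return pseudo can be selected if the
  ; atomicrmw reaches the DAG as atomic_load_fadd in addrspace(1).
  %unused = atomicrmw fadd float addrspace(1)* %ptr, float %data monotonic
  ret void
}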