Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -307,6 +307,8 @@ MVT getFenceOperandTy(const DataLayout &DL) const override { return MVT::i32; } + + AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; }; namespace AMDGPUISD { Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4477,3 +4477,10 @@ return false; } } + +TargetLowering::AtomicExpansionKind +AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + if (RMW->getOperation() == AtomicRMWInst::Nand) + return AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::None; +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -601,6 +601,7 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + addPass(createAtomicExpandPass()); addPass(createAMDGPULowerIntrinsicsPass()); if (TM.getTargetTriple().getArch() == Triple::r600 || Index: test/CodeGen/AMDGPU/atomicrmw-nand.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind { +; GCN-LABEL: atomic_nand_i32_lds: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ds_read_b32 v2, v0 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: BB0_1: ; %atomicrmw.start +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v1, v2 +; GCN-NEXT: v_or_b32_e32 v1, -5, v1 +; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_cbranch_execnz BB0_1 +; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst + ret i32 %result +} + +define i32 @atomic_nand_i32_global(i32 addrspace(1)* %ptr) nounwind { +; GCN-LABEL: atomic_nand_i32_global: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: global_load_dword v3, v[0:1], off +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: BB1_1: ; %atomicrmw.start +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_not_b32_e32 v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, -5, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_cbranch_execnz BB1_1 +; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand i32 addrspace(1)* %ptr, i32 4 seq_cst + ret i32 %result +} + +define i32 @atomic_nand_i32_flat(i32* %ptr) nounwind { +; GCN-LABEL: atomic_nand_i32_flat: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_load_dword v3, v[0:1] +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: BB2_1: ; %atomicrmw.start +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v2, v3 +; GCN-NEXT: v_or_b32_e32 v2, -5, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN-NEXT: s_cbranch_execnz BB2_1 +; GCN-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand i32* %ptr, i32 4 seq_cst + ret i32 %result +} Index: test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll =================================================================== --- /dev/null +++ test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-nand.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -atomic-expand %s | FileCheck %s +; RUN: opt -mtriple=r600-mesa-mesa3d -S -atomic-expand %s | FileCheck %s + +define i32 @test_atomicrmw_nand_i32_flat(i32* %ptr, i32 %value) { +; CHECK-LABEL: @test_atomicrmw_nand_i32_flat( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE:%.*]] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg i32* [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand i32* %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_nand_i32_global(i32 addrspace(1)* %ptr, i32 %value) { +; CHECK-LABEL: @test_atomicrmw_nand_i32_global( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE:%.*]] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg i32 addrspace(1)* [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand i32 addrspace(1)* %ptr, i32 %value seq_cst + ret i32 %res +} + +define i32 @test_atomicrmw_nand_i32_local(i32 addrspace(3)* %ptr, i32 %value) { +; CHECK-LABEL: @test_atomicrmw_nand_i32_local( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(3)* [[PTR:%.*]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE:%.*]] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = cmpxchg i32 addrspace(3)* [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP3]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: ret i32 [[NEWLOADED]] +; + %res = atomicrmw nand i32 addrspace(3)* %ptr, i32 %value seq_cst + ret i32 %res +} Index: test/Transforms/AtomicExpand/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/AtomicExpand/AMDGPU/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True