Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -45,6 +45,7 @@ FunctionPass *createSIWholeQuadModePass(); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); FunctionPass *createSIFixSGPRCopiesPass(); +FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); @@ -125,6 +126,9 @@ void initializeSIAnnotateControlFlowPass(PassRegistry&); extern char &SIAnnotateControlFlowPassID; +void initializeSIMemoryLegalizerPass(PassRegistry&); +extern char &SIMemoryLegalizerID; + void initializeSIDebuggerInsertNopsPass(PassRegistry&); extern char &SIDebuggerInsertNopsID; Index: lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h @@ -0,0 +1,57 @@ +//===--- AMDGPUMachineModuleInfo.h ------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Machine Module Info. +/// +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H + +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/IR/LLVMContext.h" + +namespace llvm { + +class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF { +private: + + // All supported memory/synchronization scopes can be found here: + // http://llvm.org/docs/AMDGPUUsage.html#memory-scopes + + /// \brief Agent synchronization scope ID. + SyncScope::ID AgentSSID; + /// \brief Workgroup synchronization scope ID. + SyncScope::ID WorkgroupSSID; + /// \brief Wavefront synchronization scope ID. + SyncScope::ID WavefrontSSID; + +public: + AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI); + + /// \returns Agent synchronization scope ID. + SyncScope::ID getAgentSSID() const { + return AgentSSID; + } + /// \returns Workgroup synchronization scope ID. + SyncScope::ID getWorkgroupSSID() const { + return WorkgroupSSID; + } + /// \returns Wavefront synchronization scope ID. + SyncScope::ID getWavefrontSSID() const { + return WavefrontSSID; + } +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEMODULEINFO_H Index: lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp @@ -0,0 +1,29 @@ +//===--- AMDGPUMachineModuleInfo.cpp ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Machine Module Info. 
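// Editorial note (not part of the patch): the IDs cached above are compared
// against MachineMemOperand::getSyncScopeID() by the memory legalizer added
// later in this patch to decide whether an atomic operation needs a vmcnt
// wait and an L1 invalidate. A minimal sketch of that classification;
// "needsAgentSync" is a hypothetical helper name, not something this change
// introduces:
//
//   static bool needsAgentSync(const AMDGPUMachineModuleInfo &MMI,
//                              const MachineMemOperand &MMO) {
//     SyncScope::ID SSID = MMO.getSyncScopeID();
//     // Only system and agent scope require synchronizing beyond the
//     // workgroup; workgroup, wavefront and singlethread scopes do not.
//     return SSID == SyncScope::System || SSID == MMI.getAgentSSID();
//   }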
+/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMachineModuleInfo.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI) + : MachineModuleInfoELF(MMI) { + LLVMContext &CTX = MMI.getModule()->getContext(); + AgentSSID = CTX.getOrInsertSyncScopeID("agent"); + WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup"); + WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront"); +} + +} // end namespace llvm Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -150,6 +150,7 @@ initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIInsertSkipsPass(*PR); + initializeSIMemoryLegalizerPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); @@ -806,6 +807,7 @@ addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); + addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -437,6 +437,7 @@ let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; + let maybeAtomic = 1; } // FIXME: tfe can't be an operand because it requires a separate @@ -483,6 +484,7 @@ let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; + let maybeAtomic = 1; } multiclass MUBUF_Pseudo_Stores; def : Pat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) >; } @@ -1171,12 +1174,12 @@ def : Pat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 1, $slc, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0) >; def : Pat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 1, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) >; } let Predicates = [isSICI] in { Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -51,6 +51,7 @@ AMDGPUMCInstLower.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp + AMDGPUMachineModuleInfo.cpp AMDGPUUnifyMetadata.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp AMDGPUSubtarget.cpp @@ -93,6 +94,7 @@ SILowerI1Copies.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp + SIMemoryLegalizer.cpp SIOptimizeExecMasking.cpp SIPeepholeSDWA.cpp SIRegisterInfo.cpp Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ -111,6 +111,7 @@ " $vdst, $vaddr$offset$glc$slc"> { let has_data = 0; let mayLoad = 1; + let maybeAtomic = 1; } class FLAT_Global_Load_Pseudo : @@ -134,6 +135,7 @@ let 
mayLoad = 0; let mayStore = 1; let has_vdst = 0; + let maybeAtomic = 1; } class FLAT_Global_Store_Pseudo : @@ -169,6 +171,7 @@ let glcValue = 0; let has_vdst = 0; let PseudoInstr = NAME; + let maybeAtomic = 1; } def _RTN : FLAT_Pseudo : Pat < (vt (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc))), - (inst $vaddr, $offset, 1, $slc) + (inst $vaddr, $offset, 0, $slc) >; class FlatStorePat : Pat < @@ -404,7 +408,7 @@ // atomic store follows atomic binop convention so the address comes // first. (node (FLATAtomic i64:$vaddr, i16:$offset, i1:$slc), vt:$data), - (inst $vaddr, $data, $offset, 1, $slc) + (inst $vaddr, $data, $offset, 0, $slc) >; class FlatAtomicPat DisableSIDecoder = 0; Index: lib/Target/AMDGPU/SIInstructions.td =================================================================== --- lib/Target/AMDGPU/SIInstructions.td +++ lib/Target/AMDGPU/SIInstructions.td @@ -99,6 +99,7 @@ [(atomic_fence (i32 imm:$ordering), (i32 imm:$scope))], "ATOMIC_FENCE $ordering, $scope"> { let hasSideEffects = 1; + let maybeAtomic = 1; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { Index: lib/Target/AMDGPU/SIMemoryLegalizer.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -0,0 +1,490 @@ +//===--- SIMemoryLegalizer.cpp ----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Memory legalizer - implements memory model. More information can be +/// found here: +/// http://llvm.org/docs/AMDGPUUsage.html#memory-model +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUMachineModuleInfo.h" +#include "AMDGPUSubtarget.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/IR/DiagnosticInfo.h" + +using namespace llvm; +using namespace llvm::AMDGPU; + +#define DEBUG_TYPE "si-memory-legalizer" +#define PASS_NAME "SI Memory Legalizer" + +namespace { + +class SIMemoryLegalizer final : public MachineFunctionPass { +private: + struct AtomicInfo final { + SyncScope::ID SSID = SyncScope::System; + AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent; + AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent; + + AtomicInfo() {} + + AtomicInfo(SyncScope::ID SSID, + AtomicOrdering Ordering, + AtomicOrdering FailureOrdering) + : SSID(SSID), + Ordering(Ordering), + FailureOrdering(FailureOrdering) {} + + AtomicInfo(const MachineMemOperand *MMO) + : SSID(MMO->getSyncScopeID()), + Ordering(MMO->getOrdering()), + FailureOrdering(MMO->getFailureOrdering()) {} + }; + + /// \brief LLVM context. + LLVMContext *CTX = nullptr; + /// \brief Machine module info. + const AMDGPUMachineModuleInfo *MMI = nullptr; + /// \brief Instruction info. + const SIInstrInfo *TII = nullptr; + + /// \brief Immediate for "vmcnt(0)". + unsigned Vmcnt0Immediate = 0; + /// \brief Opcode for cache invalidation instruction (L1). + unsigned Wbinvl1Opcode = 0; + + /// \brief List of atomic pseudo instructions. + std::list AtomicPseudoMIs; + + /// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI. + /// Always returns true. 
+  bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
+                              bool Before = true) const;
+  /// \brief Inserts "s_waitcnt vmcnt(0)" instruction \p Before or after \p MI.
+  /// Always returns true.
+  bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
+                           bool Before = true) const;
+
+  /// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is
+  /// modified, false otherwise.
+  bool setGLC(const MachineBasicBlock::iterator &MI) const;
+
+  /// \brief Removes all processed atomic pseudo instructions from the current
+  /// function. Returns true if current function is modified, false otherwise.
+  bool removeAtomicPseudoMIs();
+
+  /// \brief Reports unknown synchronization scope used in \p MI to LLVM
+  /// context.
+  void reportUnknownSynchScope(const MachineBasicBlock::iterator &MI);
+
+  /// \returns Atomic fence info if \p MI is an atomic fence operation,
+  /// "None" otherwise.
+  Optional<AtomicInfo> getAtomicFenceInfo(
+      const MachineBasicBlock::iterator &MI) const;
+  /// \returns Atomic load info if \p MI is an atomic load operation,
+  /// "None" otherwise.
+  Optional<AtomicInfo> getAtomicLoadInfo(
+      const MachineBasicBlock::iterator &MI) const;
+  /// \returns Atomic store info if \p MI is an atomic store operation,
+  /// "None" otherwise.
+  Optional<AtomicInfo> getAtomicStoreInfo(
+      const MachineBasicBlock::iterator &MI) const;
+  /// \returns Atomic cmpxchg info if \p MI is an atomic cmpxchg operation,
+  /// "None" otherwise.
+  Optional<AtomicInfo> getAtomicCmpxchgInfo(
+      const MachineBasicBlock::iterator &MI) const;
+  /// \returns Atomic rmw info if \p MI is an atomic rmw operation,
+  /// "None" otherwise.
+  Optional<AtomicInfo> getAtomicRmwInfo(
+      const MachineBasicBlock::iterator &MI) const;
+
+  /// \brief Expands atomic fence operation \p MI. Returns true if
+  /// instructions are added/deleted or \p MI is modified, false otherwise.
+  bool expandAtomicFence(const AtomicInfo &AI,
+                         MachineBasicBlock::iterator &MI);
+  /// \brief Expands atomic load operation \p MI. Returns true if
+  /// instructions are added/deleted or \p MI is modified, false otherwise.
+  bool expandAtomicLoad(const AtomicInfo &AI,
+                        MachineBasicBlock::iterator &MI);
+  /// \brief Expands atomic store operation \p MI. Returns true if
+  /// instructions are added/deleted or \p MI is modified, false otherwise.
+  bool expandAtomicStore(const AtomicInfo &AI,
+                         MachineBasicBlock::iterator &MI);
+  /// \brief Expands atomic cmpxchg operation \p MI. Returns true if
+  /// instructions are added/deleted or \p MI is modified, false otherwise.
+  bool expandAtomicCmpxchg(const AtomicInfo &AI,
+                           MachineBasicBlock::iterator &MI);
+  /// \brief Expands atomic rmw operation \p MI. Returns true if
+  /// instructions are added/deleted or \p MI is modified, false otherwise.
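// Editorial note (not part of the original patch): the getAtomic*Info helpers
// above do not match on opcodes; loads, stores and read-modify-write
// operations are told apart by the mayLoad/mayStore flags, and rmw vs.
// cmpxchg purely by whether the memory operand carries a failure ordering.
// When an instruction has zero or several memory operands, the
// default-constructed AtomicInfo (system scope, seq_cst) is returned so that
// the subsequent expansion is maximally conservative.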
+ bool expandAtomicRmw(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI); + +public: + static char ID; + + SIMemoryLegalizer() + : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return PASS_NAME; + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // end namespace anonymous + +bool SIMemoryLegalizer::insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI, + bool Before) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (!Before) + ++MI; + + BuildMI(MBB, MI, DL, TII->get(Wbinvl1Opcode)); + + if (!Before) + --MI; + + return true; +} + +bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI, + bool Before) const { + MachineBasicBlock &MBB = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + if (!Before) + ++MI; + + BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Vmcnt0Immediate); + + if (!Before) + --MI; + + return true; +} + +bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const { + int GLCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc); + if (GLCIdx == -1) + return false; + + MachineOperand &GLC = MI->getOperand(GLCIdx); + if (GLC.getImm() == 1) + return false; + + GLC.setImm(1); + return true; +} + +bool SIMemoryLegalizer::removeAtomicPseudoMIs() { + if (AtomicPseudoMIs.empty()) + return false; + + for (auto &MI : AtomicPseudoMIs) + MI->eraseFromParent(); + + AtomicPseudoMIs.clear(); + return true; +} + +void SIMemoryLegalizer::reportUnknownSynchScope( + const MachineBasicBlock::iterator &MI) { + DiagnosticInfoUnsupported Diag(*MI->getParent()->getParent()->getFunction(), + "Unsupported synchronization scope"); + CTX->diagnose(Diag); +} + +Optional SIMemoryLegalizer::getAtomicFenceInfo( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE) + return None; + + SyncScope::ID SSID = + static_cast(MI->getOperand(1).getImm()); + AtomicOrdering Ordering = + static_cast(MI->getOperand(0).getImm()); + return AtomicInfo(SSID, Ordering, AtomicOrdering::NotAtomic); +} + +Optional SIMemoryLegalizer::getAtomicLoadInfo( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && !MI->mayStore())) + return None; + if (!MI->hasOneMemOperand()) + return AtomicInfo(); + + const MachineMemOperand *MMO = *MI->memoperands_begin(); + if (!MMO->isAtomic()) + return None; + + return AtomicInfo(MMO); +} + +Optional SIMemoryLegalizer::getAtomicStoreInfo( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(!MI->mayLoad() && MI->mayStore())) + return None; + if (!MI->hasOneMemOperand()) + return AtomicInfo(); + + const MachineMemOperand *MMO = *MI->memoperands_begin(); + if (!MMO->isAtomic()) + return None; + + return AtomicInfo(MMO); +} + +Optional SIMemoryLegalizer::getAtomicCmpxchgInfo( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && MI->mayStore())) + return None; + if (!MI->hasOneMemOperand()) + return AtomicInfo(); + + const MachineMemOperand *MMO = *MI->memoperands_begin(); + if (!MMO->isAtomic()) + return None; + if (MMO->getFailureOrdering() == 
AtomicOrdering::NotAtomic) + return None; + + return AtomicInfo(MMO); +} + +Optional SIMemoryLegalizer::getAtomicRmwInfo( + const MachineBasicBlock::iterator &MI) const { + assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic); + + if (!(MI->mayLoad() && MI->mayStore())) + return None; + if (!MI->hasOneMemOperand()) + return AtomicInfo(); + + const MachineMemOperand *MMO = *MI->memoperands_begin(); + if (!MMO->isAtomic()) + return None; + if (MMO->getFailureOrdering() != AtomicOrdering::NotAtomic) + return None; + + return AtomicInfo(MMO); +} + +bool SIMemoryLegalizer::expandAtomicFence(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI) { + assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE); + + bool Changed = false; + if (AI.SSID == SyncScope::System || + AI.SSID == MMI->getAgentSSID()) { + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::Release || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= insertBufferWbinvl1Vol(MI); + + AtomicPseudoMIs.push_back(MI); + return Changed; + } else if (AI.SSID == SyncScope::SingleThread || + AI.SSID == MMI->getWorkgroupSSID() || + AI.SSID == MMI->getWavefrontSSID()) { + return Changed; + } else { + reportUnknownSynchScope(MI); + return Changed; + } +} + +bool SIMemoryLegalizer::expandAtomicLoad(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && !MI->mayStore()); + + bool Changed = false; + if (AI.SSID == SyncScope::System || + AI.SSID == MMI->getAgentSSID()) { + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= setGLC(MI); + + if (AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } else if (AI.SSID == SyncScope::SingleThread || + AI.SSID == MMI->getWorkgroupSSID() || + AI.SSID == MMI->getWavefrontSSID()) { + return Changed; + } else { + reportUnknownSynchScope(MI); + return Changed; + } +} + +bool SIMemoryLegalizer::expandAtomicStore(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI) { + assert(!MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + if (AI.SSID == SyncScope::System || + AI.SSID == MMI->getAgentSSID()) { + if (AI.Ordering == AtomicOrdering::Release || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + return Changed; + } else if (AI.SSID == SyncScope::SingleThread || + AI.SSID == MMI->getWorkgroupSSID() || + AI.SSID == MMI->getWavefrontSSID()) { + return Changed; + } else { + reportUnknownSynchScope(MI); + return Changed; + } +} + +bool SIMemoryLegalizer::expandAtomicCmpxchg(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + if (AI.SSID == SyncScope::System || + AI.SSID == MMI->getAgentSSID()) { + if (AI.Ordering == AtomicOrdering::Release || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent || + AI.FailureOrdering == AtomicOrdering::SequentiallyConsistent) + Changed |= 
insertWaitcntVmcnt0(MI); + + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent || + AI.FailureOrdering == AtomicOrdering::Acquire || + AI.FailureOrdering == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } else if (AI.SSID == SyncScope::SingleThread || + AI.SSID == MMI->getWorkgroupSSID() || + AI.SSID == MMI->getWavefrontSSID()) { + Changed |= setGLC(MI); + return Changed; + } else { + reportUnknownSynchScope(MI); + return Changed; + } +} + +bool SIMemoryLegalizer::expandAtomicRmw(const AtomicInfo &AI, + MachineBasicBlock::iterator &MI) { + assert(MI->mayLoad() && MI->mayStore()); + + bool Changed = false; + if (AI.SSID == SyncScope::System || + AI.SSID == MMI->getAgentSSID()) { + if (AI.Ordering == AtomicOrdering::Release || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) + Changed |= insertWaitcntVmcnt0(MI); + + if (AI.Ordering == AtomicOrdering::Acquire || + AI.Ordering == AtomicOrdering::AcquireRelease || + AI.Ordering == AtomicOrdering::SequentiallyConsistent) { + Changed |= insertWaitcntVmcnt0(MI, false); + Changed |= insertBufferWbinvl1Vol(MI, false); + } + + return Changed; + } else if (AI.SSID == SyncScope::SingleThread || + AI.SSID == MMI->getWorkgroupSSID() || + AI.SSID == MMI->getWavefrontSSID()) { + Changed |= setGLC(MI); + return Changed; + } else { + reportUnknownSynchScope(MI); + return Changed; + } +} + +bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + const SISubtarget &ST = MF.getSubtarget(); + const IsaInfo::IsaVersion IV = IsaInfo::getIsaVersion(ST.getFeatureBits()); + + CTX = &MF.getFunction()->getContext(); + MMI = &MF.getMMI().getObjFileInfo(); + TII = ST.getInstrInfo(); + + Vmcnt0Immediate = + AMDGPU::encodeWaitcnt(IV, 0, getExpcntBitMask(IV), getLgkmcntBitMask(IV)); + Wbinvl1Opcode = ST.getGeneration() <= AMDGPUSubtarget::SOUTHERN_ISLANDS ? 
+ AMDGPU::BUFFER_WBINVL1 : AMDGPU::BUFFER_WBINVL1_VOL; + + for (auto &MBB : MF) { + for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) { + if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic)) + continue; + + if (const auto &AI = getAtomicFenceInfo(MI)) + Changed |= expandAtomicFence(AI.getValue(), MI); + else if (const auto &AI = getAtomicLoadInfo(MI)) + Changed |= expandAtomicLoad(AI.getValue(), MI); + else if (const auto &AI = getAtomicStoreInfo(MI)) + Changed |= expandAtomicStore(AI.getValue(), MI); + else if (const auto &AI = getAtomicCmpxchgInfo(MI)) + Changed |= expandAtomicCmpxchg(AI.getValue(), MI); + else if (const auto &AI = getAtomicRmwInfo(MI)) + Changed |= expandAtomicRmw(AI.getValue(), MI); + } + } + + Changed |= removeAtomicPseudoMIs(); + return Changed; +} + +INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) + +char SIMemoryLegalizer::ID = 0; +char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; + +FunctionPass *llvm::createSIMemoryLegalizerPass() { + return new SIMemoryLegalizer(); +} Index: test/CodeGen/AMDGPU/fence-amdgiz.ll =================================================================== --- test/CodeGen/AMDGPU/fence-amdgiz.ll +++ test/CodeGen/AMDGPU/fence-amdgiz.ll @@ -3,11 +3,11 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" target triple = "amdgcn-amd-amdhsa-amdgizcl" -; CHECK_LABEL: atomic_fence -; CHECK: BB#0: -; CHECK: ATOMIC_FENCE 4, 1 -; CHECK: s_endpgm - +; CHECK-LABEL: atomic_fence +; CHECK: BB#0: +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK-NEXT: s_endpgm define amdgpu_kernel void @atomic_fence() { fence acquire ret void Index: test/CodeGen/AMDGPU/flat_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics.ll +++ test/CodeGen/AMDGPU/flat_atomics.ll @@ -998,8 +998,8 @@ } ; GCN-LABEL: {{^}}atomic_store_i32_offset: -; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16 glc{{$}} +; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}} define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 @@ -1008,7 +1008,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i32: -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { entry: store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 @@ -1016,8 +1016,8 @@ } ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: -; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16 glc{{$}} +; CIVI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +; GFX9: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} offset:16{{$}} define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index @@ -1027,7 +1027,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i32_addr64: -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; GCN: flat_store_dword 
v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index Index: test/CodeGen/AMDGPU/flat_atomics_i64.ll =================================================================== --- test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -846,7 +846,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i64_offset: -; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 @@ -855,7 +855,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i64: -; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc +; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { entry: store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8 @@ -863,7 +863,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: -; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index @@ -873,7 +873,7 @@ } ; GCN-LABEL: {{^}}atomic_store_i64_addr64: -; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} +; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index Index: test/CodeGen/AMDGPU/global_atomics.ll =================================================================== --- test/CodeGen/AMDGPU/global_atomics.ll +++ test/CodeGen/AMDGPU/global_atomics.ll @@ -1004,8 +1004,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i32_offset: -; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} -; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} +; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 @@ -1014,8 +1014,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i32: -; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}} -; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 @@ -1023,8 +1023,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: -; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} -; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 
offset:16{{$}} +; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index @@ -1034,8 +1034,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i32_addr64: -; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} +; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index Index: test/CodeGen/AMDGPU/global_atomics_i64.ll =================================================================== --- test/CodeGen/AMDGPU/global_atomics_i64.ll +++ test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -997,8 +997,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i64_offset: -; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} -; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} +; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 @@ -1007,8 +1007,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i64: -; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc -; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc +; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} +; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { entry: store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8 @@ -1016,8 +1016,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset: -; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} -; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} +; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} +; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index @@ -1027,8 +1027,8 @@ } ; FUNC-LABEL: {{^}}atomic_store_i64_addr64: -; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} -; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} +; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} +; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}]{{$}} define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll 
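Editorial note (not part of the patch): the expectation changes in the existing tests above follow from dropping the hard-coded glc=1 operand in the atomic load/store selection patterns; the memory legalizer now sets GLC only where an acquire is actually required, and seq_cst stores are ordered with s_waitcnt vmcnt(0) instead, so the stores no longer carry a glc bit. The new test below exercises cmpxchg across every combination of scope (system, singlethread, agent, workgroup, wavefront) and success/failure ordering, checking that only system and agent scope produce the s_waitcnt vmcnt(0) / buffer_wbinvl1_vol sequences.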
=================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll @@ -0,0 +1,652 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}system_monotonic_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_monotonic_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_acquire_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acquire_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_release_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_release_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in release monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_acq_rel_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acq_rel_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_seq_cst_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_acquire_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acquire_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +; CHECK-LABEL: {{^}}system_release_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], 
v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_release_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in release acquire + ret void +} + +; CHECK-LABEL: {{^}}system_acq_rel_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acq_rel_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_seq_cst_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_seq_cst_seq_cst( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; CHECK-LABEL: {{^}}singlethread_monotonic_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_monotonic_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acquire_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acquire_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +; CHECK-LABEL: {{^}}singlethread_release_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_release_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +; CHECK-LABEL: 
{{^}}singlethread_acq_rel_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acq_rel_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_seq_cst_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acquire_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acquire_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +; CHECK-LABEL: {{^}}singlethread_release_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_release_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acq_rel_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acq_rel_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_seq_cst_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_seq_cst_seq_cst( + i32 addrspace(4)* 
%out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +; CHECK-LABEL: {{^}}agent_monotonic_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_monotonic_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_acquire_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acquire_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_release_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_release_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_acq_rel_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acq_rel_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst_monotonic +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_seq_cst_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_acquire_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acquire_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +; CHECK-LABEL: {{^}}agent_release_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: 
s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_release_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +; CHECK-LABEL: {{^}}agent_acq_rel_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acq_rel_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst_acquire +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_seq_cst_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_seq_cst_seq_cst( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +; CHECK-LABEL: {{^}}workgroup_monotonic_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_monotonic_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acquire_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acquire_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +; CHECK-LABEL: {{^}}workgroup_release_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_release_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +; CHECK-LABEL: 
{{^}}workgroup_acq_rel_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acq_rel_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_seq_cst_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acquire_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acquire_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +; CHECK-LABEL: {{^}}workgroup_release_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_release_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acq_rel_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acq_rel_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_seq_cst_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_seq_cst_seq_cst( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, 
i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +; CHECK-LABEL: {{^}}wavefront_monotonic_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_monotonic_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acquire_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acquire_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_release_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_release_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acq_rel_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acq_rel_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_seq_cst_monotonic( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acquire_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acquire_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +; CHECK-LABEL: {{^}}wavefront_release_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: 
s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_release_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acq_rel_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acq_rel_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_seq_cst_acquire( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_seq_cst_seq_cst( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll @@ -0,0 +1,205 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=GFX8 %s + +; FUNC-LABEL: {{^}}system_acquire +; GCN: BB#0 +; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acquire() { +entry: + fence acquire + ret void +} + +; FUNC-LABEL: {{^}}system_release +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_release() { +entry: + fence release + ret void +} + +; FUNC-LABEL: {{^}}system_acq_rel +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_acq_rel() { +entry: + fence acq_rel + ret void +} + +; FUNC-LABEL: {{^}}system_seq_cst +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @system_seq_cst() { +entry: + fence seq_cst + ret void 
+} + +; FUNC-LABEL: {{^}}singlethread_acquire +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acquire() { +entry: + fence syncscope("singlethread") acquire + ret void +} + +; FUNC-LABEL: {{^}}singlethread_release +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_release() { +entry: + fence syncscope("singlethread") release + ret void +} + +; FUNC-LABEL: {{^}}singlethread_acq_rel +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_acq_rel() { +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}singlethread_seq_cst +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @singlethread_seq_cst() { +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}agent_acquire +; GCN: BB#0 +; GFX6: s_waitcnt vmcnt(0){{$}} +; GFX6-NEXT: buffer_wbinvl1{{$}} +; GFX8: s_waitcnt vmcnt(0){{$}} +; GFX8-NEXT: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acquire() { +entry: + fence syncscope("agent") acquire + ret void +} + +; FUNC-LABEL: {{^}}agent_release +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_release() { +entry: + fence syncscope("agent") release + ret void +} + +; FUNC-LABEL: {{^}}agent_acq_rel +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_acq_rel() { +entry: + fence syncscope("agent") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}agent_seq_cst +; GCN: BB#0 +; GCN: s_waitcnt vmcnt(0){{$}} +; GFX6: buffer_wbinvl1{{$}} +; GFX8: buffer_wbinvl1_vol{{$}} +; GCN: s_endpgm +define amdgpu_kernel void @agent_seq_cst() { +entry: + fence syncscope("agent") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acquire +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acquire() { +entry: + fence syncscope("workgroup") acquire + ret void +} + +; FUNC-LABEL: {{^}}workgroup_release +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_release() { +entry: + fence syncscope("workgroup") release + ret void +} + +; FUNC-LABEL: {{^}}workgroup_acq_rel +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_acq_rel() { +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}workgroup_seq_cst +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @workgroup_seq_cst() { +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acquire +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acquire() { +entry: + fence syncscope("wavefront") acquire + ret void +} + +; FUNC-LABEL: {{^}}wavefront_release +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_release() { +entry: + fence syncscope("wavefront") release + ret void +} + +; FUNC-LABEL: {{^}}wavefront_acq_rel +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_acq_rel() { +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +; FUNC-LABEL: {{^}}wavefront_seq_cst +; GCN: BB#0 +; GCN: s_endpgm +define amdgpu_kernel void @wavefront_seq_cst() { +entry: + fence syncscope("wavefront") seq_cst + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-load.ll @@ -0,0 +1,282 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s +; 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}system_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_unordered( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in unordered, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}system_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_monotonic( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in monotonic, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}system_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_acquire( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in acquire, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @system_seq_cst( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}singlethread_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_unordered( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}singlethread_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_monotonic( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_acquire( + 
i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @singlethread_seq_cst( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}agent_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_unordered( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") unordered, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}agent_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_monotonic( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}agent_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_acquire( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") acquire, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @agent_seq_cst( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}workgroup_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_unordered( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}workgroup_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: 
s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_monotonic( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_acquire( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @workgroup_seq_cst( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}wavefront_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_unordered( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}wavefront_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_monotonic( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_acquire( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +; CHECK: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @wavefront_seq_cst( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + 
ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll @@ -0,0 +1,302 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}system_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_monotonic( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in monotonic + ret void +} + +; CHECK-LABEL: {{^}}system_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acquire( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in acquire + ret void +} + +; CHECK-LABEL: {{^}}system_release +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_release( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in release + ret void +} + +; CHECK-LABEL: {{^}}system_acq_rel +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_acq_rel( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in acq_rel + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @system_seq_cst( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst + ret void +} + +; CHECK-LABEL: {{^}}singlethread_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_monotonic( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acquire( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +; CHECK-LABEL: {{^}}singlethread_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_release( + i32 addrspace(4)* %out, 
i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") release + ret void +} + +; CHECK-LABEL: {{^}}singlethread_acq_rel +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_acq_rel( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @singlethread_seq_cst( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +; CHECK-LABEL: {{^}}agent_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_monotonic( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") monotonic + ret void +} + +; CHECK-LABEL: {{^}}agent_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acquire( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") acquire + ret void +} + +; CHECK-LABEL: {{^}}agent_release +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_release( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") release + ret void +} + +; CHECK-LABEL: {{^}}agent_acq_rel +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_acq_rel( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NEXT: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: buffer_wbinvl1_vol +define amdgpu_kernel void @agent_seq_cst( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +; CHECK-LABEL: {{^}}workgroup_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_monotonic( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acquire +; CHECK-NOT: s_waitcnt 
vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acquire( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +; CHECK-LABEL: {{^}}workgroup_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_release( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") release + ret void +} + +; CHECK-LABEL: {{^}}workgroup_acq_rel +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_acq_rel( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @workgroup_seq_cst( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +; CHECK-LABEL: {{^}}wavefront_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_monotonic( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acquire +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acquire( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +; CHECK-LABEL: {{^}}wavefront_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_release( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") release + ret void +} + +; CHECK-LABEL: {{^}}wavefront_acq_rel +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define amdgpu_kernel void @wavefront_acq_rel( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK-NOT: buffer_wbinvl1_vol +define 
amdgpu_kernel void @wavefront_seq_cst( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-atomic-store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-atomic-store.ll @@ -0,0 +1,202 @@ +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck %s + +; CHECK-LABEL: {{^}}system_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_unordered( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out unordered, align 4 + ret void +} + +; CHECK-LABEL: {{^}}system_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_monotonic( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out monotonic, align 4 + ret void +} + +; CHECK-LABEL: {{^}}system_release +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_release( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out release, align 4 + ret void +} + +; CHECK-LABEL: {{^}}system_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @system_seq_cst( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 + ret void +} + +; CHECK-LABEL: {{^}}singlethread_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_unordered( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") unordered, align 4 + ret void +} + +; CHECK-LABEL: {{^}}singlethread_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_monotonic( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +; CHECK-LABEL: {{^}}singlethread_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_release( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") release, align 4 + ret void +} + +; CHECK-LABEL: {{^}}singlethread_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @singlethread_seq_cst( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +; CHECK-LABEL: {{^}}agent_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_unordered( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* 
%out syncscope("agent") unordered, align 4 + ret void +} + +; CHECK-LABEL: {{^}}agent_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_monotonic( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") monotonic, align 4 + ret void +} + +; CHECK-LABEL: {{^}}agent_release +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_release( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") release, align 4 + ret void +} + +; CHECK-LABEL: {{^}}agent_seq_cst +; CHECK: s_waitcnt vmcnt(0){{$}} +; CHECK-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @agent_seq_cst( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("agent") seq_cst, align 4 + ret void +} + +; CHECK-LABEL: {{^}}workgroup_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_unordered( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") unordered, align 4 + ret void +} + +; CHECK-LABEL: {{^}}workgroup_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_monotonic( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +; CHECK-LABEL: {{^}}workgroup_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_release( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") release, align 4 + ret void +} + +; CHECK-LABEL: {{^}}workgroup_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @workgroup_seq_cst( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +; CHECK-LABEL: {{^}}wavefront_unordered +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_unordered( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") unordered, align 4 + ret void +} + +; CHECK-LABEL: {{^}}wavefront_monotonic +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_monotonic( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +; CHECK-LABEL: {{^}}wavefront_release +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_release( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") release, align 4 + ret void +} + +; CHECK-LABEL: {{^}}wavefront_seq_cst +; CHECK-NOT: s_waitcnt vmcnt(0){{$}} +; 
CHECK: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} +define amdgpu_kernel void @wavefront_seq_cst( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("wavefront") seq_cst, align 4 + ret void +} Index: test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -0,0 +1,43 @@ +; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s + +; CHECK: error: :0:0: in function invalid_fence void (): Unsupported synchronization scope +define amdgpu_kernel void @invalid_fence() { +entry: + fence syncscope("invalid") seq_cst + ret void +} + +; CHECK: error: :0:0: in function invalid_load void (i32 addrspace(4)*, i32 addrspace(4)*): Unsupported synchronization scope +define amdgpu_kernel void @invalid_load( + i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +entry: + %val = load atomic i32, i32 addrspace(4)* %in syncscope("invalid") seq_cst, align 4 + store i32 %val, i32 addrspace(4)* %out + ret void +} + +; CHECK: error: :0:0: in function invalid_store void (i32, i32 addrspace(4)*): Unsupported synchronization scope +define amdgpu_kernel void @invalid_store( + i32 %in, i32 addrspace(4)* %out) { +entry: + store atomic i32 %in, i32 addrspace(4)* %out syncscope("invalid") seq_cst, align 4 + ret void +} + +; CHECK: error: :0:0: in function invalid_cmpxchg void (i32 addrspace(4)*, i32, i32): Unsupported synchronization scope +define amdgpu_kernel void @invalid_cmpxchg( + i32 addrspace(4)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in syncscope("invalid") seq_cst seq_cst + ret void +} + +; CHECK: error: :0:0: in function invalid_rmw void (i32 addrspace(4)*, i32): Unsupported synchronization scope +define amdgpu_kernel void @invalid_rmw( + i32 addrspace(4)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in syncscope("invalid") seq_cst + ret void +} Index: test/CodeGen/AMDGPU/syncscopes.ll =================================================================== --- test/CodeGen/AMDGPU/syncscopes.ll +++ test/CodeGen/AMDGPU/syncscopes.ll @@ -1,9 +1,9 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-before=si-debugger-insert-nops < %s | FileCheck --check-prefix=GCN %s ; GCN-LABEL: name: syncscopes -; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) -; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) -; GCN: FLAT_STORE_DWORD killed %vgpr7_vgpr8, killed %vgpr6, 0, -1, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) +; GCN: FLAT_STORE_DWORD killed %vgpr1_vgpr2, killed %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("agent") seq_cst 4 into %ir.agent_out) +; GCN: FLAT_STORE_DWORD killed %vgpr4_vgpr5, killed %vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("workgroup") seq_cst 4 into %ir.workgroup_out) +; GCN: FLAT_STORE_DWORD killed 
%vgpr7_vgpr8, killed %vgpr6, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store syncscope("wavefront") seq_cst 4 into %ir.wavefront_out) define void @syncscopes( i32 %agent, i32 addrspace(4)* %agent_out, Index: test/CodeGen/MIR/AMDGPU/memory-legalizer-atomic-insert-end.mir =================================================================== --- /dev/null +++ test/CodeGen/MIR/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -0,0 +1,122 @@ +# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s + +--- | + ; ModuleID = '' + source_filename = "" + target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + + ; Function Attrs: nounwind readnone + declare i32 @llvm.amdgcn.workitem.id.x() #0 + + ; Function Attrs: nounwind + define amdgpu_kernel void @atomic_max_i32_noret( + i32 addrspace(1)* %out, + i32 addrspace(1)* addrspace(1)* %in, + i32 addrspace(1)* %x, + i32 %y) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = sext i32 %tid to i64 + %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i64 %idxprom + %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep + %xor = xor i32 %tid, 1 + %cmp = icmp ne i32 %xor, 0 + %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp) + %2 = extractvalue { i1, i64 } %1, 0 + %3 = extractvalue { i1, i64 } %1, 1 + br i1 %2, label %atomic, label %exit + + atomic: ; preds = %0 + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 + %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst + br label %exit + + exit: ; preds = %atomic, %0 + call void @llvm.amdgcn.end.cf(i64 %3) + ret void + } + + declare { i1, i64 } @llvm.amdgcn.if(i1) + + declare void @llvm.amdgcn.end.cf(i64) + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #3 + + attributes #0 = { nounwind readnone "target-cpu"="tahiti" } + attributes #1 = { nounwind "target-cpu"="tahiti" } + attributes #2 = { readnone } + attributes #3 = { nounwind } + +... 
+--- + +# CHECK-LABEL: name: atomic_max_i32_noret + +# CHECK-LABEL: bb.1.atomic: +# CHECK: BUFFER_ATOMIC_SMAX_ADDR64 +# CHECK-NEXT: S_WAITCNT 3952 +# CHECK-NEXT: BUFFER_WBINVL1_VOL + +name: atomic_max_i32_noret +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%sgpr0_sgpr1' } + - { reg: '%vgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %vgpr1 = V_ASHRREV_I32_e32 31, %vgpr0, implicit %exec + %vgpr1_vgpr2 = V_LSHL_B64 %vgpr0_vgpr1, 3, implicit %exec + %sgpr7 = S_MOV_B32 61440 + %sgpr6 = S_MOV_B32 0 + S_WAITCNT 127 + %vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed %vgpr1_vgpr2, %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 8 from %ir.tid.gep) + %vgpr0 = V_XOR_B32_e32 1, killed %vgpr0, implicit %exec + V_CMP_NE_U32_e32 0, killed %vgpr0, implicit-def %vcc, implicit %exec + %sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed %vcc, implicit-def %exec, implicit-def %scc, implicit %exec + %sgpr2_sgpr3 = S_XOR_B64 %exec, killed %sgpr2_sgpr3, implicit-def dead %scc + SI_MASK_BRANCH %bb.2.exit, implicit %exec + + bb.1.atomic: + successors: %bb.2.exit(0x80000000) + liveins: %sgpr4_sgpr5_sgpr6_sgpr7:0x0000000C, %sgpr0_sgpr1, %sgpr2_sgpr3, %vgpr1_vgpr2_vgpr3_vgpr4:0x00000003 + + %sgpr0 = S_LOAD_DWORD_IMM killed %sgpr0_sgpr1, 15, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`) + dead %vgpr0 = V_MOV_B32_e32 -1, implicit %exec + dead %vgpr0 = V_MOV_B32_e32 61440, implicit %exec + %sgpr4_sgpr5 = S_MOV_B64 0 + S_WAITCNT 127 + %vgpr0 = V_MOV_B32_e32 killed %sgpr0, implicit %exec, implicit %exec + S_WAITCNT 3952 + BUFFER_ATOMIC_SMAX_ADDR64 killed %vgpr0, killed %vgpr1_vgpr2, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit %exec :: (volatile load seq_cst 4 from %ir.gep) + + bb.2.exit: + liveins: %sgpr2_sgpr3 + + %exec = S_OR_B64 %exec, killed %sgpr2_sgpr3, implicit-def %scc + S_ENDPGM + +... +