Skip to content

Commit 51809cb

Browse files
committed Mar 25, 2019
AMDGPU: Add support for cross address space synchronization scopes
Differential Revision: https://reviews.llvm.org/D59517
llvm-svn: 356946
1 parent 65bd5d8 commit 51809cb

19 files changed

+2692
-490
lines changed
 

‎llvm/docs/AMDGPUUsage.rst

Lines changed: 74 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -323,62 +323,80 @@ is conservatively correct for OpenCL.
323323
.. table:: AMDHSA LLVM Sync Scopes
324324
:name: amdgpu-amdhsa-llvm-sync-scopes-table
325325

326-
================ ==========================================================
327-
LLVM Sync Scope Description
328-
================ ==========================================================
329-
*none* The default: ``system``.
330-
331-
Synchronizes with, and participates in modification and
332-
seq_cst total orderings with, other operations (except
333-
image operations) for all address spaces (except private,
334-
or generic that accesses private) provided the other
335-
operation's sync scope is:
336-
337-
- ``system``.
338-
- ``agent`` and executed by a thread on the same agent.
339-
- ``workgroup`` and executed by a thread in the same
340-
workgroup.
341-
- ``wavefront`` and executed by a thread in the same
342-
wavefront.
343-
344-
``agent`` Synchronizes with, and participates in modification and
345-
seq_cst total orderings with, other operations (except
346-
image operations) for all address spaces (except private,
347-
or generic that accesses private) provided the other
348-
operation's sync scope is:
349-
350-
- ``system`` or ``agent`` and executed by a thread on the
351-
same agent.
352-
- ``workgroup`` and executed by a thread in the same
353-
workgroup.
354-
- ``wavefront`` and executed by a thread in the same
355-
wavefront.
356-
357-
``workgroup`` Synchronizes with, and participates in modification and
358-
seq_cst total orderings with, other operations (except
359-
image operations) for all address spaces (except private,
360-
or generic that accesses private) provided the other
361-
operation's sync scope is:
362-
363-
- ``system``, ``agent`` or ``workgroup`` and executed by a
364-
thread in the same workgroup.
365-
- ``wavefront`` and executed by a thread in the same
366-
wavefront.
367-
368-
``wavefront`` Synchronizes with, and participates in modification and
369-
seq_cst total orderings with, other operations (except
370-
image operations) for all address spaces (except private,
371-
or generic that accesses private) provided the other
372-
operation's sync scope is:
373-
374-
- ``system``, ``agent``, ``workgroup`` or ``wavefront``
375-
and executed by a thread in the same wavefront.
376-
377-
``singlethread`` Only synchronizes with, and participates in modification
378-
and seq_cst total orderings with, other operations (except
379-
image operations) running in the same thread for all
380-
address spaces (for example, in signal handlers).
381-
================ ==========================================================
326+
======================= ===================================================
327+
LLVM Sync Scope Description
328+
======================= ===================================================
329+
*none* The default: ``system``.
330+
331+
Synchronizes with, and participates in modification
332+
and seq_cst total orderings with, other operations
333+
(except image operations) for all address spaces
334+
(except private, or generic that accesses private)
335+
provided the other operation's sync scope is:
336+
337+
- ``system``.
338+
- ``agent`` and executed by a thread on the same
339+
agent.
340+
- ``workgroup`` and executed by a thread in the
341+
same workgroup.
342+
- ``wavefront`` and executed by a thread in the
343+
same wavefront.
344+
345+
``agent`` Synchronizes with, and participates in modification
346+
and seq_cst total orderings with, other operations
347+
(except image operations) for all address spaces
348+
(except private, or generic that accesses private)
349+
provided the other operation's sync scope is:
350+
351+
- ``system`` or ``agent`` and executed by a thread
352+
on the same agent.
353+
- ``workgroup`` and executed by a thread in the
354+
same workgroup.
355+
- ``wavefront`` and executed by a thread in the
356+
same wavefront.
357+
358+
``workgroup`` Synchronizes with, and participates in modification
359+
and seq_cst total orderings with, other operations
360+
(except image operations) for all address spaces
361+
(except private, or generic that accesses private)
362+
provided the other operation's sync scope is:
363+
364+
- ``system``, ``agent`` or ``workgroup`` and
365+
executed by a thread in the same workgroup.
366+
- ``wavefront`` and executed by a thread in the
367+
same wavefront.
368+
369+
``wavefront`` Synchronizes with, and participates in modification
370+
and seq_cst total orderings with, other operations
371+
(except image operations) for all address spaces
372+
(except private, or generic that accesses private)
373+
provided the other operation's sync scope is:
374+
375+
- ``system``, ``agent``, ``workgroup`` or
376+
``wavefront`` and executed by a thread in the
377+
same wavefront.
378+
379+
``singlethread`` Only synchronizes with, and participates in
380+
modification and seq_cst total orderings with,
381+
other operations (except image operations) running
382+
in the same thread for all address spaces (for
383+
example, in signal handlers).
384+
385+
``one-as`` Same as ``system`` but only synchronizes with other
386+
operations within the same address space.
387+
388+
``agent-one-as`` Same as ``agent`` but only synchronizes with other
389+
operations within the same address space.
390+
391+
``workgroup-one-as`` Same as ``workgroup`` but only synchronizes with
392+
other operations within the same address space.
393+
394+
``wavefront-one-as`` Same as ``wavefront`` but only synchronizes with
395+
other operations within the same address space.
396+
397+
``singlethread-one-as`` Same as ``singlethread`` but only synchronizes with
398+
other operations within the same address space.
399+
======================= ===================================================
382400

383401
AMDGPU Intrinsics
384402
-----------------

‎llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,16 @@ AMDGPUMachineModuleInfo::AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI)
2323
AgentSSID = CTX.getOrInsertSyncScopeID("agent");
2424
WorkgroupSSID = CTX.getOrInsertSyncScopeID("workgroup");
2525
WavefrontSSID = CTX.getOrInsertSyncScopeID("wavefront");
26+
SystemOneAddressSpaceSSID =
27+
CTX.getOrInsertSyncScopeID("one-as");
28+
AgentOneAddressSpaceSSID =
29+
CTX.getOrInsertSyncScopeID("agent-one-as");
30+
WorkgroupOneAddressSpaceSSID =
31+
CTX.getOrInsertSyncScopeID("workgroup-one-as");
32+
WavefrontOneAddressSpaceSSID =
33+
CTX.getOrInsertSyncScopeID("wavefront-one-as");
34+
SingleThreadOneAddressSpaceSSID =
35+
CTX.getOrInsertSyncScopeID("singlethread-one-as");
2636
}
2737

2838
} // end namespace llvm

‎llvm/lib/Target/AMDGPU/AMDGPUMachineModuleInfo.h

Lines changed: 61 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -29,12 +29,22 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
2929
// All supported memory/synchronization scopes can be found here:
3030
// http://llvm.org/docs/AMDGPUUsage.html#memory-scopes
3131

32-
/// Agent synchronization scope ID.
32+
/// Agent synchronization scope ID (cross address space).
3333
SyncScope::ID AgentSSID;
34-
/// Workgroup synchronization scope ID.
34+
/// Workgroup synchronization scope ID (cross address space).
3535
SyncScope::ID WorkgroupSSID;
36-
/// Wavefront synchronization scope ID.
36+
/// Wavefront synchronization scope ID (cross address space).
3737
SyncScope::ID WavefrontSSID;
38+
/// System synchronization scope ID (single address space).
39+
SyncScope::ID SystemOneAddressSpaceSSID;
40+
/// Agent synchronization scope ID (single address space).
41+
SyncScope::ID AgentOneAddressSpaceSSID;
42+
/// Workgroup synchronization scope ID (single address space).
43+
SyncScope::ID WorkgroupOneAddressSpaceSSID;
44+
/// Wavefront synchronization scope ID (single address space).
45+
SyncScope::ID WavefrontOneAddressSpaceSSID;
46+
/// Single thread synchronization scope ID (single address space).
47+
SyncScope::ID SingleThreadOneAddressSpaceSSID;
3848

3949
/// In AMDGPU target synchronization scopes are inclusive, meaning a
4050
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -43,35 +53,70 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
4353
/// \returns \p SSID's inclusion ordering, or "None" if \p SSID is not
4454
/// supported by the AMDGPU target.
4555
Optional<uint8_t> getSyncScopeInclusionOrdering(SyncScope::ID SSID) const {
46-
if (SSID == SyncScope::SingleThread)
56+
if (SSID == SyncScope::SingleThread ||
57+
SSID == getSingleThreadOneAddressSpaceSSID())
4758
return 0;
48-
else if (SSID == getWavefrontSSID())
59+
else if (SSID == getWavefrontSSID() ||
60+
SSID == getWavefrontOneAddressSpaceSSID())
4961
return 1;
50-
else if (SSID == getWorkgroupSSID())
62+
else if (SSID == getWorkgroupSSID() ||
63+
SSID == getWorkgroupOneAddressSpaceSSID())
5164
return 2;
52-
else if (SSID == getAgentSSID())
65+
else if (SSID == getAgentSSID() ||
66+
SSID == getAgentOneAddressSpaceSSID())
5367
return 3;
54-
else if (SSID == SyncScope::System)
68+
else if (SSID == SyncScope::System ||
69+
SSID == getSystemOneAddressSpaceSSID())
5570
return 4;
5671

5772
return None;
5873
}
5974

75+
/// \returns True if \p SSID is restricted to a single address space, false
76+
/// otherwise.
77+
bool isOneAddressSpace(SyncScope::ID SSID) const {
78+
return SSID == getSingleThreadOneAddressSpaceSSID() ||
79+
SSID == getWavefrontOneAddressSpaceSSID() ||
80+
SSID == getWorkgroupOneAddressSpaceSSID() ||
81+
SSID == getAgentOneAddressSpaceSSID() ||
82+
SSID == getSystemOneAddressSpaceSSID();
83+
}
84+
6085
public:
6186
AMDGPUMachineModuleInfo(const MachineModuleInfo &MMI);
6287

63-
/// \returns Agent synchronization scope ID.
88+
/// \returns Agent synchronization scope ID (cross address space).
6489
SyncScope::ID getAgentSSID() const {
6590
return AgentSSID;
6691
}
67-
/// \returns Workgroup synchronization scope ID.
92+
/// \returns Workgroup synchronization scope ID (cross address space).
6893
SyncScope::ID getWorkgroupSSID() const {
6994
return WorkgroupSSID;
7095
}
71-
/// \returns Wavefront synchronization scope ID.
96+
/// \returns Wavefront synchronization scope ID (cross address space).
7297
SyncScope::ID getWavefrontSSID() const {
7398
return WavefrontSSID;
7499
}
100+
/// \returns System synchronization scope ID (single address space).
101+
SyncScope::ID getSystemOneAddressSpaceSSID() const {
102+
return SystemOneAddressSpaceSSID;
103+
}
104+
/// \returns Agent synchronization scope ID (single address space).
105+
SyncScope::ID getAgentOneAddressSpaceSSID() const {
106+
return AgentOneAddressSpaceSSID;
107+
}
108+
/// \returns Workgroup synchronization scope ID (single address space).
109+
SyncScope::ID getWorkgroupOneAddressSpaceSSID() const {
110+
return WorkgroupOneAddressSpaceSSID;
111+
}
112+
/// \returns Wavefront synchronization scope ID (single address space).
113+
SyncScope::ID getWavefrontOneAddressSpaceSSID() const {
114+
return WavefrontOneAddressSpaceSSID;
115+
}
116+
/// \returns Single thread synchronization scope ID (single address space).
117+
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const {
118+
return SingleThreadOneAddressSpaceSSID;
119+
}
75120

76121
/// In AMDGPU target synchronization scopes are inclusive, meaning a
77122
/// larger synchronization scope is inclusive of a smaller synchronization
@@ -87,7 +132,11 @@ class AMDGPUMachineModuleInfo final : public MachineModuleInfoELF {
87132
if (!AIO || !BIO)
88133
return None;
89134

90-
return AIO.getValue() > BIO.getValue();
135+
bool IsAOneAddressSpace = isOneAddressSpace(A);
136+
bool IsBOneAddressSpace = isOneAddressSpace(B);
137+
138+
return AIO.getValue() >= BIO.getValue() &&
139+
(IsAOneAddressSpace == IsBOneAddressSpace || !IsAOneAddressSpace);
91140
}
92141
};
93142

‎llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 30 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -417,35 +417,46 @@ void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
417417
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
418418
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
419419
SIAtomicAddrSpace InstrScope) const {
420-
/// TODO: For now assume OpenCL memory model which treats each
421-
/// address space as having a separate happens-before relation, and
422-
/// so an instruction only has ordering with respect to the address
423-
/// space it accesses, and if it accesses multiple address spaces it
424-
/// does not require ordering of operations in different address
425-
/// spaces.
426-
if (SSID == SyncScope::System)
420+
if (SSID == SyncScope::System)
421+
return std::make_tuple(SIAtomicScope::SYSTEM,
422+
SIAtomicAddrSpace::ATOMIC,
423+
true);
424+
if (SSID == MMI->getAgentSSID())
425+
return std::make_tuple(SIAtomicScope::AGENT,
426+
SIAtomicAddrSpace::ATOMIC,
427+
true);
428+
if (SSID == MMI->getWorkgroupSSID())
429+
return std::make_tuple(SIAtomicScope::WORKGROUP,
430+
SIAtomicAddrSpace::ATOMIC,
431+
true);
432+
if (SSID == MMI->getWavefrontSSID())
433+
return std::make_tuple(SIAtomicScope::WAVEFRONT,
434+
SIAtomicAddrSpace::ATOMIC,
435+
true);
436+
if (SSID == SyncScope::SingleThread)
437+
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
438+
SIAtomicAddrSpace::ATOMIC,
439+
true);
440+
if (SSID == MMI->getSystemOneAddressSpaceSSID())
427441
return std::make_tuple(SIAtomicScope::SYSTEM,
428442
SIAtomicAddrSpace::ATOMIC & InstrScope,
429443
false);
430-
if (SSID == MMI->getAgentSSID())
444+
if (SSID == MMI->getAgentOneAddressSpaceSSID())
431445
return std::make_tuple(SIAtomicScope::AGENT,
432446
SIAtomicAddrSpace::ATOMIC & InstrScope,
433447
false);
434-
if (SSID == MMI->getWorkgroupSSID())
448+
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
435449
return std::make_tuple(SIAtomicScope::WORKGROUP,
436450
SIAtomicAddrSpace::ATOMIC & InstrScope,
437451
false);
438-
if (SSID == MMI->getWavefrontSSID())
452+
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
439453
return std::make_tuple(SIAtomicScope::WAVEFRONT,
440454
SIAtomicAddrSpace::ATOMIC & InstrScope,
441455
false);
442-
if (SSID == SyncScope::SingleThread)
456+
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
443457
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
444458
SIAtomicAddrSpace::ATOMIC & InstrScope,
445459
false);
446-
/// TODO: To support the HSA Memory Model, additional memory scopes
447-
/// need to be added that specify that cross address space ordering
448-
/// is required.
449460
return None;
450461
}
451462

@@ -721,13 +732,12 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
721732

722733
bool VMCnt = false;
723734
bool LGKMCnt = false;
724-
bool EXPCnt = false;
725735

726736
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
727737
switch (Scope) {
728738
case SIAtomicScope::SYSTEM:
729739
case SIAtomicScope::AGENT:
730-
VMCnt = true;
740+
VMCnt |= true;
731741
break;
732742
case SIAtomicScope::WORKGROUP:
733743
case SIAtomicScope::WAVEFRONT:
@@ -751,7 +761,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
751761
// also synchronizing with global/GDS memory as LDS operations
752762
// could be reordered with respect to later global/GDS memory
753763
// operations of the same wave.
754-
LGKMCnt = IsCrossAddrSpaceOrdering;
764+
LGKMCnt |= IsCrossAddrSpaceOrdering;
755765
break;
756766
case SIAtomicScope::WAVEFRONT:
757767
case SIAtomicScope::SINGLETHREAD:
@@ -773,7 +783,7 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
773783
// also synchronizing with global/LDS memory as GDS operations
774784
// could be reordered with respect to later global/LDS memory
775785
// operations of the same wave.
776-
EXPCnt = IsCrossAddrSpaceOrdering;
786+
LGKMCnt |= IsCrossAddrSpaceOrdering;
777787
break;
778788
case SIAtomicScope::WORKGROUP:
779789
case SIAtomicScope::WAVEFRONT:
@@ -786,11 +796,11 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
786796
}
787797
}
788798

789-
if (VMCnt || LGKMCnt || EXPCnt) {
799+
if (VMCnt || LGKMCnt) {
790800
unsigned WaitCntImmediate =
791801
AMDGPU::encodeWaitcnt(IV,
792802
VMCnt ? 0 : getVmcntBitMask(IV),
793-
EXPCnt ? 0 : getExpcntBitMask(IV),
803+
getExpcntBitMask(IV),
794804
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
795805
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
796806
Changed = true;

‎llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,10 @@ define i32 @atomic_nand_i32_lds(i32 addrspace(3)* %ptr) nounwind {
1212
; GCN-NEXT: s_waitcnt lgkmcnt(0)
1313
; GCN-NEXT: v_not_b32_e32 v1, v2
1414
; GCN-NEXT: v_or_b32_e32 v1, -5, v1
15+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1516
; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
16-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
17+
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18+
; GCN-NEXT: buffer_wbinvl1_vol
1719
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
1820
; GCN-NEXT: v_mov_b32_e32 v2, v1
1921
; GCN-NEXT: s_or_b64 s[6:7], vcc, s[6:7]

0 commit comments

Comments
 (0)
Please sign in to comment.