Diff 389991

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Show First 20 Lines • Show All 789 Lines • ▼ Show 20 Lines	bool SIGfx6CacheControl::enableLoadCacheBypass(
SIAtomicAddrSpace AddrSpace) const {		SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());		assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;		bool Changed = false;

if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {		if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {		switch (Scope) {
case SIAtomicScope::SYSTEM:		case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:		case SIAtomicScope::AGENT:
		// Set L1 cache policy to MISS_EVICT.
		// Note: there is no L2 cache bypass policy at the ISA level.
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
break;		break;
case SIAtomicScope::WORKGROUP:		case SIAtomicScope::WORKGROUP:
case SIAtomicScope::WAVEFRONT:		case SIAtomicScope::WAVEFRONT:
case SIAtomicScope::SINGLETHREAD:		case SIAtomicScope::SINGLETHREAD:
// No cache to bypass.		// No cache to bypass.
break;		break;
default:		default:
Show All 26 Lines

bool SIGfx6CacheControl::enableRMWCacheBypass(		bool SIGfx6CacheControl::enableRMWCacheBypass(
const MachineBasicBlock::iterator &MI,		const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,		SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {		SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && MI->mayStore());		assert(MI->mayLoad() && MI->mayStore());
bool Changed = false;		bool Changed = false;

/// The L1 cache is write through so does not need to be bypassed. There is no		/// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
/// bypass control for the L2 cache at the isa level.		/// bypassed, and the GLC bit is instead used to indicate if they are
		/// return or no-return.
		t-tyeUnsubmitted Done Reply Inline Actions Please add back: /// There is no bypass control for the L2 cache at the isa level. The modified comment is only explaining the L1 cache and both caches are involved for system scope. t-tye: Please add back: /// There is no bypass control for the L2 cache at the isa level. The…
		critsonAuthorUnsubmitted Done Reply Inline Actions I deleted that text because there is a bypass for L2 stores and atomics on GFX10: SLC=0 DLC=1. I can put it back but only contextualised for targets before GFX10? (And the same for all the similar references in comments below.) critson: I deleted that text because there is a bypass for L2 stores and atomics on GFX10: SLC=0 DLC=1.
		t-tyeUnsubmitted Done Reply Inline Actions In GFX10 the bypass is only available for stores and not loads, and is not coherent so cannot be used anyway. That is why I added the word "coherent" in one of the comments below. So probably should do that here too. t-tye: In GFX10 the bypass is only available for stores and not loads, and is not coherent so cannot…
		/// Note: there is no L2 cache coherent bypass control at the ISA level.

return Changed;		return Changed;
}		}

bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(		bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,		MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
bool IsVolatile, bool IsNonTemporal) const {		bool IsVolatile, bool IsNonTemporal) const {
// Only handle load and store, not atomic read-modify-write insructions. The		// Only handle load and store, not atomic read-modify-write insructions. The
// latter use glc to indicate if the atomic returns a result and so must not		// latter use glc to indicate if the atomic returns a result and so must not
// be used for cache control.		// be used for cache control.
assert(MI->mayLoad() ^ MI->mayStore());		assert(MI->mayLoad() ^ MI->mayStore());

// Only update load and store, not LLVM IR atomic read-modify-write		// Only update load and store, not LLVM IR atomic read-modify-write
// instructions. The latter are always marked as volatile so cannot sensibly		// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support		// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.		// the nontemporal attribute.
assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);		assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);

bool Changed = false;		bool Changed = false;

if (IsVolatile) {		if (IsVolatile) {
		// Set L1 cache policy to be MISS_EVICT for load instructions
		// and MISS_LRU for store instructions.
		t-tyeUnsubmitted Done Reply Inline Actions How about: // Request L1 cache policy to be MISS_EVICT for load instructions and MISS_LRU for store instructions. Note that there is no L2 cache bypass policy at the isa level. t-tye: How about: // Request L1 cache policy to be MISS_EVICT for load instructions and MISS_LRU…
		t-tyeUnsubmitted Done Reply Inline Actions I would not add "(write-combine)" as it really is not an exact match for that term. t-tye: I would not add "(write-combine)" as it really is not an exact match for that term.
		// Note: there is no L2 cache bypass policy at the ISA level.
if (Op == SIMemOp::LOAD)		if (Op == SIMemOp::LOAD)
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);

// Ensure operation has completed at system scope to cause all volatile		// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not		// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be		// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS		// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.		// address space operations.
Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,		Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);		Position::AFTER);

return Changed;		return Changed;
		foadUnsubmitted Done Reply Inline Actions Not really related to your patch, but why do we return here? Doesn't that mean that IsNonTemporal is effectively ignored if IsVolatile is true? Wouldn't it be both better and simpler to fall through to the IsNonTemporal handling here? foad: Not really related to your patch, but why do we return here? Doesn't that mean that…
		critsonAuthorUnsubmitted Done Reply Inline Actions This is a good point. From a bit setting perspective it would be fine. Of course there is the question of what it semantically means to have a volatile nontemporal access when we seem to define volatile as "bypasses all caches". critson: This is a good point. From a bit setting perspective it would be fine. Of course there is the…
		t-tyeUnsubmitted Done Reply Inline Actions I suspect this is because at one time relaxed atomics were marked as volatile. This may have been because the C/C++/OpenCL standards defined them that way, or because LLVM back then did not fully support atomics so marking them all as volatile made the existing passes "do the right thing". So this code may have been an attempt not to pessimize normal relaxed atomics. LLVM does not support non-temporal atomics currently. I am not sure if these reasons are still the case so would be good to investigate and potentially fix this code (or at least document why it is the way it is with a FIXME). That can be a separate review I think. t-tye: I suspect this is because at one time relaxed atomics were marked as volatile. This may have…
		critsonAuthorUnsubmitted Done Reply Inline Actions Yes, a separate investigation and review. critson: Yes, a separate investigation and review.
}		}

if (IsNonTemporal) {		if (IsNonTemporal) {
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.		// Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
		// for both loads and stores, and the L2 cache policy to STREAM.
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
		t-tyeUnsubmitted Done Reply Inline Actions I had not remembered that the GLC and SLC bits are used together to set the L1 cache policy. So how about: // Setting GLC and SLC both to 1 sets the L1 cache policy to MISS_EVICT for both loads and stores, and the L2 cache policy to STREAM. and delete the comment below. t-tye: I had not remembered that the GLC and SLC bits are used together to set the L1 cache policy. So…
Changed \|= enableSLCBit(MI);		Changed \|= enableSLCBit(MI);
return Changed;		return Changed;
}		}

return Changed;		return Changed;
}		}

bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,		bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
▲ Show 20 Lines • Show All 204 Lines • ▼ Show 20 Lines	bool SIGfx90ACacheControl::enableLoadCacheBypass(
SIAtomicAddrSpace AddrSpace) const {		SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());		assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;		bool Changed = false;

if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {		if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
switch (Scope) {		switch (Scope) {
case SIAtomicScope::SYSTEM:		case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:		case SIAtomicScope::AGENT:
		// Set the L1 cache policy to MISS_LRU.
		t-tyeUnsubmitted Done Reply Inline Actions MISS_EVICT is a policy that only applies when both SLC and GLC are set. Here only GLC is being set which specifies MISS_LRU and L2 LRU. How about: // Set the L1 cache policy to MISS_LRU. Note that there is no L2 cache bypass policy at the isa level. t-tye: MISS_EVICT is a policy that only applies when both SLC and GLC are set. Here only GLC is being…
		critsonAuthorUnsubmitted Done Reply Inline Actions "The load intentionally misses the GPU L1 and reads from L2. If there was a line in the GPU L1 that matched, it is invalidated; L2 is reread." -- (CDNA1 Shader ISA, p67). This sounds like MISS_EVICT to me. As far as I know MISS_LRU only exists for stores and means write-combine, whereas MISS_EVICT is write-through. critson: > "The load intentionally misses the GPU L1 and reads from L2. If there was a line in the GPU…
		t-tyeUnsubmitted Done Reply Inline Actions GFX90A cache policies are different to other GFX9 and GFX10. If GLC=1 then the L1 policy is MISS_LRU for loads: any existing line is invalidated, then the cache line is loaded and remains in the cache with LRU policy. t-tye: GFX90A cache policies are different to other GFX9 and GFX10. If GLC=1 then the L1 policy is…
		// Note: there is no L2 cache bypass policy at the ISA level.
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
break;		break;
case SIAtomicScope::WORKGROUP:		case SIAtomicScope::WORKGROUP:
// In threadgroup split mode the waves of a work-group can be executing on		// In threadgroup split mode the waves of a work-group can be executing on
// different CUs. Therefore need to bypass the L1 which is per CU.		// different CUs. Therefore need to bypass the L1 which is per CU.
// Otherwise in non-threadgroup split mode all waves of a work-group are		// Otherwise in non-threadgroup split mode all waves of a work-group are
// on the same CU, and so the L1 does not need to be bypassed.		// on the same CU, and so the L1 does not need to be bypassed.
if (ST.isTgSplitEnabled())		if (ST.isTgSplitEnabled())
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
// instructions. The latter are always marked as volatile so cannot sensibly		// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support		// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.		// the nontemporal attribute.
assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);		assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);

bool Changed = false;		bool Changed = false;

if (IsVolatile) {		if (IsVolatile) {
		// Set L1 cache policy to be MISS_EVICT for load instructions
		// and MISS_LRU for store instructions.
		t-tyeUnsubmitted Done Reply Inline Actions How about: // Request L1 cache policy to be MISS_EVICT for load instructions and MISS_LRU for store instructions. Note that there is no L2 cache bypass policy at the isa level. t-tye: How about: // Request L1 cache policy to be MISS_EVICT for load instructions and MISS_LRU for…
		t-tyeUnsubmitted Done Reply Inline Actions Would eliminate "(write-combine)" as mentioned above. t-tye: Would eliminate "(write-combine)" as mentioned above.
		// Note: there is no L2 cache bypass policy at the ISA level.
if (Op == SIMemOp::LOAD)		if (Op == SIMemOp::LOAD)
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);

// Ensure operation has completed at system scope to cause all volatile		// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not		// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be		// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS		// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.		// address space operations.
Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,		Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);		Position::AFTER);

return Changed;		return Changed;
}		}

if (IsNonTemporal) {		if (IsNonTemporal) {
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.		// Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
		// for both loads and stores, and the L2 cache policy to STREAM.
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
Changed \|= enableSLCBit(MI);		Changed \|= enableSLCBit(MI);
		t-tyeUnsubmitted Done Reply Inline Actions How about: // Setting GLC and SLC both to 1 sets the L1 cache policy to MISS_EVICT for both loads and stores, and the L2 cache policy to STREAM. t-tye: How about: // Setting GLC and SLC both to 1 sets the L1 cache policy to MISS_EVICT for both…
return Changed;		return Changed;
}		}

return Changed;		return Changed;
}		}

bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,		bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,		SIAtomicScope Scope,
▲ Show 20 Lines • Show All 140 Lines • ▼ Show 20 Lines
bool SIGfx10CacheControl::enableLoadCacheBypass(		bool SIGfx10CacheControl::enableLoadCacheBypass(
const MachineBasicBlock::iterator &MI,		const MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,		SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace) const {		SIAtomicAddrSpace AddrSpace) const {
assert(MI->mayLoad() && !MI->mayStore());		assert(MI->mayLoad() && !MI->mayStore());
bool Changed = false;		bool Changed = false;

if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {		if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
/// TODO Do not set glc for rmw atomic operations as they
/// implicitly bypass the L0/L1 caches.

switch (Scope) {		switch (Scope) {
case SIAtomicScope::SYSTEM:		case SIAtomicScope::SYSTEM:
case SIAtomicScope::AGENT:		case SIAtomicScope::AGENT:
		// Set the L0 and L1 cache policies to MISS_EVICT.
		t-tyeUnsubmitted Done Reply Inline Actions How about: // Set the L0 and L1 cache policies to MISS_EVICT. Note that there is no L2 cache bypass policy at the isa level. t-tye: How about: // Set the L0 and L1 cache policies to MISS_EVICT. Note that there is no L2 cache…
		// Note: there is no L2 cache coherent bypass control at the ISA level.
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
Changed \|= enableDLCBit(MI);		Changed \|= enableDLCBit(MI);
break;		break;
case SIAtomicScope::WORKGROUP:		case SIAtomicScope::WORKGROUP:
// In WGP mode the waves of a work-group can be executing on either CU of		// In WGP mode the waves of a work-group can be executing on either CU of
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in		// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
// CU mode all waves of a work-group are on the same CU, and so the L0		// CU mode all waves of a work-group are on the same CU, and so the L0
// does not need to be bypassed.		// does not need to be bypassed.
Show All 32 Lines	bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
// instructions. The latter are always marked as volatile so cannot sensibly		// instructions. The latter are always marked as volatile so cannot sensibly
// handle it as do not want to pessimize all atomics. Also they do not support		// handle it as do not want to pessimize all atomics. Also they do not support
// the nontemporal attribute.		// the nontemporal attribute.
assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);		assert(Op == SIMemOp::LOAD \|\| Op == SIMemOp::STORE);

bool Changed = false;		bool Changed = false;

if (IsVolatile) {		if (IsVolatile) {
		// Set L0 and L1 cache policy to be MISS_EVICT for load instructions
		// and MISS_LRU for store instructions.
		t-tyeUnsubmitted Done Reply Inline Actions Eliminate "(write-combine)". Add: // Note that there is no L2 cache coherent bypass policy at the isa level. t-tye: Eliminate "(write-combine)". Add: // Note that there is no L2 cache coherent bypass policy at…
		// Note: there is no L2 cache coherent bypass control at the ISA level.
		t-tyeUnsubmitted Done Reply Inline Actions How about: // Request L0 and L1 cache policy to be MISS_EVICT for load instructions and MISS_LRU for store instructions. Note that there is no L2 cache coherent bypass policy at the isa level. t-tye: How about: // Request L0 and L1 cache policy to be MISS_EVICT for load instructions and…
if (Op == SIMemOp::LOAD) {		if (Op == SIMemOp::LOAD) {
Changed \|= enableGLCBit(MI);		Changed \|= enableGLCBit(MI);
Changed \|= enableDLCBit(MI);		Changed \|= enableDLCBit(MI);
}		}

// Ensure operation has completed at system scope to cause all volatile		// Ensure operation has completed at system scope to cause all volatile
// operations to be visible outside the program in a global order. Do not		// operations to be visible outside the program in a global order. Do not
// request cross address space as only the global address space can be		// request cross address space as only the global address space can be
// observable outside the program, so no need to cause a waitcnt for LDS		// observable outside the program, so no need to cause a waitcnt for LDS
// address space operations.		// address space operations.
Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,		Changed \|= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
Position::AFTER);		Position::AFTER);
return Changed;		return Changed;
}		}

if (IsNonTemporal) {		if (IsNonTemporal) {
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.		// For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
		// and L2 cache policy to STREAM.
		// For stores setting both GLC and SLC configures L0 and L1 cache policy
		// to MISS_EVICT and the L2 cache policy to STREAM.
		if (Op == SIMemOp::STORE)
		Changed \|= enableGLCBit(MI);
		foadUnsubmitted Not Done Reply Inline Actions Does this change the documented code sequences: https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-memory-model-code-sequences-gfx10-table ? foad: Does this change the documented code sequences: https://llvm.org/docs/AMDGPUUsage.html#amdgpu…
		foadUnsubmitted Not Done Reply Inline Actions D114707 foad: D114707
Changed \|= enableSLCBit(MI);		Changed \|= enableSLCBit(MI);

		t-tyeUnsubmitted Done Reply Inline Actions It appears that this should be setting GLC=1 for stores so that L0 will be HIT_EVICT instead of MISS_EVICT. This must not be done for loads as that would make L0 MISS_EVICT. How about: // For loads setting GLC to 1 sets the L0 and L1 cache policy to HIT_EVICT and the L2 cache policy to STREAM. For stores setting GLC and SLC both to 1 sets the L0 and L1 cache policy to MISS_EVICT and the L2 cache policy to STREAM. t-tye: It appears that this should be setting GLC=1 for stores so that L0 will be HIT_EVICT instead of…
		critsonAuthorUnsubmitted Done Reply Inline Actions Do you have MISS_EVICT and HIT_EVICT flipped in your description? Do you mean: // For loads setting SLC to 1 sets the L0 and L1 cache policy to HIT_EVICT and the L2 cache policy to STREAM. For stores setting GLC and SLC both to 1 sets the L0 cache policy to MISS_EVICT and the L2 cache policy to STREAM. L1 is always bypassed for stores. I can add the GLC bit for stores and this ceases to be NFC. critson: Do you have MISS_EVICT and HIT_EVICT flipped in your description? Do you mean: // For loads…
		t-tyeUnsubmitted Done Reply Inline Actions I believe I have it right according to the hardware GFX10 memory model spec. // For loads setting SLC to 1 sets the L0 and L1 cache policy to HIT_EVICT and the L2 cache policy to STREAM. For stores setting GLC and SLC both to 1 sets the L0 and L1 cache policy to MISS_EVICT and the L2 cache policy to STREAM. We have to state the policy for L1 too even though the hardware documentation does not state it. The L1 MUST be evict or a subsequent load could see stale data. Yes, this change ceases to be NFC and will need thorough testing. t-tye: I believe I have it right according to the hardware GFX10 memory model spec. // For loads…
		critsonAuthorUnsubmitted Done Reply Inline Actions Technically the L1 only has one policy described as "bypassed (but is coherent)" (RDNA1 Shader ISA p69), but on paper the behaviour of this looks the same as MISS_EVICT. So I guess I can accept just calling it that. Do you have any test cases which use non-temporal? critson: Technically the L1 only has one policy described as "bypassed (but is coherent)" (RDNA1 Shader…
		t-tyeUnsubmitted Done Reply Inline Actions In the GFX10 memory model spec I have, it does not state the L1 policy for stores, but my understanding is that it behaves as MISS_EVICT. To me "bypass but coherent" is semantically the same thing as MISS_EVICT and it seems odd not to use the terminology that is already well defined. t-tye: In the GFX10 memory model spec I have, it does not state the L1 policy for stores, but my…
return Changed;		return Changed;
}		}

return Changed;		return Changed;
}		}

bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,		bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,		SIAtomicScope Scope,
▲ Show 20 Lines • Show All 401 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

	Show First 20 Lines • Show All 210 Lines • ▼ Show 20 Lines
	; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
	; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]			; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
	; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2			; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
	; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc			; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: flat_nontemporal_store_0:			; GFX10-CU-LABEL: flat_nontemporal_store_0:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
	; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]			; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
	; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2			; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
	; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc			; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:			; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_0:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
	; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)			; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
	; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0			; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
	; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1			; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
	▲ Show 20 Lines • Show All 60 Lines • ▼ Show 20 Lines
	; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1			; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
	; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0			; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
	; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]			; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
	; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0			; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
	; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc			; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: flat_nontemporal_store_1:			; GFX10-CU-LABEL: flat_nontemporal_store_1:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1			; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
	; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s2, v0			; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
	; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]			; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
	; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0			; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
	; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc			; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:			; SKIP-CACHE-INV-LABEL: flat_nontemporal_store_1:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
	; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)			; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
	; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0			; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
	▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

	Show First 20 Lines • Show All 233 Lines • ▼ Show 20 Lines
	; GFX10-WGP-LABEL: global_nontemporal_store_0:			; GFX10-WGP-LABEL: global_nontemporal_store_0:
	; GFX10-WGP: ; %bb.0: ; %entry			; GFX10-WGP: ; %bb.0: ; %entry
	; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0			; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc			; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: global_nontemporal_store_0:			; GFX10-CU-LABEL: global_nontemporal_store_0:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0			; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc			; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:			; SKIP-CACHE-INV-LABEL: global_nontemporal_store_0:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000			; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1			; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
	; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)			; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
	▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines
	; GFX10-WGP-LABEL: global_nontemporal_store_1:			; GFX10-WGP-LABEL: global_nontemporal_store_1:
	; GFX10-WGP: ; %bb.0: ; %entry			; GFX10-WGP: ; %bb.0: ; %entry
	; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc			; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: global_nontemporal_store_1:			; GFX10-CU-LABEL: global_nontemporal_store_1:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc			; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:			; SKIP-CACHE-INV-LABEL: global_nontemporal_store_1:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000			; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0			; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
	; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll

	Show First 20 Lines • Show All 303 Lines • ▼ Show 20 Lines
	; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8			; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
	; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7			; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
	; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0			; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
	; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc			; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: private_nontemporal_store_0:			; GFX10-CU-LABEL: private_nontemporal_store_0:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]			; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
	; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]			; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
	; GFX10-CU-NEXT: s_clause 0x1			; GFX10-CU-NEXT: s_clause 0x1
	; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
	; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8			; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
	; GFX10-CU-NEXT: s_add_u32 s8, s8, s7			; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
	; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0			; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
	; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc			; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:			; SKIP-CACHE-INV-LABEL: private_nontemporal_store_0:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]			; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0			; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
	; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)			; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
	▲ Show 20 Lines • Show All 91 Lines • ▼ Show 20 Lines
	; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8			; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
	; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7			; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7
	; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0			; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2			; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
	; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0			; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc			; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
	; GFX10-WGP-NEXT: s_endpgm			; GFX10-WGP-NEXT: s_endpgm
	;			;
	; GFX10-CU-LABEL: private_nontemporal_store_1:			; GFX10-CU-LABEL: private_nontemporal_store_1:
	; GFX10-CU: ; %bb.0: ; %entry			; GFX10-CU: ; %bb.0: ; %entry
	; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]			; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3]
	; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]			; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1]
	; GFX10-CU-NEXT: s_clause 0x1			; GFX10-CU-NEXT: s_clause 0x1
	; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0			; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
	; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8			; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
	; GFX10-CU-NEXT: s_add_u32 s8, s8, s7			; GFX10-CU-NEXT: s_add_u32 s8, s8, s7
	; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0			; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0			; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
	; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2			; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
	; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0			; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
	; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc			; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
	; GFX10-CU-NEXT: s_endpgm			; GFX10-CU-NEXT: s_endpgm
	;			;
	; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:			; SKIP-CACHE-INV-LABEL: private_nontemporal_store_1:
	; SKIP-CACHE-INV: ; %bb.0: ; %entry			; SKIP-CACHE-INV: ; %bb.0: ; %entry
	; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]			; SKIP-CACHE-INV-NEXT: s_getpc_b64 s[4:5]
	; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0			; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s0
	; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0			; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
	; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0			; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add SIMemoryLegalizer comments to clarify bit usage
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 389991

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add SIMemoryLegalizer comments to clarify bit usage
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 389991

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll

llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll

[AMDGPU] Add SIMemoryLegalizer comments to clarify bit usage
ClosedPublic