Diff 478411

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,417 Lines • ▼ Show 20 Lines	case 64:
// can do a 4 byte aligned, 8 byte access in a single operation using		// can do a 4 byte aligned, 8 byte access in a single operation using
// ds_read2/write2_b32 with adjacent offsets.		// ds_read2/write2_b32 with adjacent offsets.
RequiredAlignment = Align(4);		RequiredAlignment = Align(4);

if (Subtarget->hasUnalignedDSAccessEnabled()) {		if (Subtarget->hasUnalignedDSAccessEnabled()) {
// We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/		// We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
// ds_write2_b32 depending on the alignment. In either case with either		// ds_write2_b32 depending on the alignment. In either case with either
// alignment there is no faster way of doing this.		// alignment there is no faster way of doing this.

		// The numbers returned here and below are not additive; they are a
		// 'speed rank'. They are only meant to be compared to decide whether one
		// way of lowering an operation is faster than another. For that purpose
		// a naturally aligned operation gets its bitsize to indicate that "it
		// operates with a speed comparable to an N-bit wide load". With the full
		// alignment ds128 is slower than ds96 for example. If underaligned it
		// is comparable to the speed of a single dword access, which would then
		// mean 32 < 128 and it is faster to issue a wide load regardless.
		// 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
		// a wider load which will not be aligned anymore, the latter is slower.
if (IsFast)		if (IsFast)
*IsFast = 1;		*IsFast = (Alignment >= RequiredAlignment) ? 64
		: (Alignment < Align(4)) ? 32
		: 1;
return true;		return true;
}		}

break;		break;
case 96:		case 96:
if (!Subtarget->hasDS96AndDS128())		if (!Subtarget->hasDS96AndDS128())
return false;		return false;

// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on		// 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
// gfx8 and older.		// gfx8 and older.

if (Subtarget->hasUnalignedDSAccessEnabled()) {		if (Subtarget->hasUnalignedDSAccessEnabled()) {
// Naturally aligned access is fastest. However, also report it is Fast		// Naturally aligned access is fastest. However, also report it is Fast
// if memory is aligned less than DWORD. A narrow load or store will be		// if memory is aligned less than DWORD. A narrow load or store will be
// be equally slow as a single ds_read_b96/ds_write_b96, but there will		// be equally slow as a single ds_read_b96/ds_write_b96, but there will
// be more of them, so overall we will pay less penalty issuing a single		// be more of them, so overall we will pay less penalty issuing a single
// instruction.		// instruction.

		// See comment on the values above.
if (IsFast)		if (IsFast)
*IsFast = Alignment >= RequiredAlignment \|\| Alignment < Align(4);		*IsFast = (Alignment >= RequiredAlignment) ? 96
		: (Alignment < Align(4)) ? 32
		: 1;
return true;		return true;
}		}

break;		break;
case 128:		case 128:
if (!Subtarget->hasDS96AndDS128() \|\| !Subtarget->useDS128())		if (!Subtarget->hasDS96AndDS128() \|\| !Subtarget->useDS128())
return false;		return false;

// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on		// 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a		// gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
// single operation using ds_read2/write2_b64.		// single operation using ds_read2/write2_b64.
RequiredAlignment = Align(8);		RequiredAlignment = Align(8);

if (Subtarget->hasUnalignedDSAccessEnabled()) {		if (Subtarget->hasUnalignedDSAccessEnabled()) {
// Naturally aligned access is fastest. However, also report it is Fast		// Naturally aligned access is fastest. However, also report it is Fast
// if memory is aligned less than DWORD. A narrow load or store will be		// if memory is aligned less than DWORD. A narrow load or store will be
// be equally slow as a single ds_read_b128/ds_write_b128, but there		// be equally slow as a single ds_read_b128/ds_write_b128, but there
// will be more of them, so overall we will pay less penalty issuing a		// will be more of them, so overall we will pay less penalty issuing a
// single instruction.		// single instruction.

		// See comment on the values above.
if (IsFast)		if (IsFast)
*IsFast= Alignment >= RequiredAlignment \|\| Alignment < Align(4);		*IsFast = (Alignment >= RequiredAlignment) ? 128
		: (Alignment < Align(4)) ? 32
		: 1;
		arsenm (Unsubmitted, Done): What do the numbers mean?
		rampitec (Author, Unsubmitted, Done): More or less 'it operates with a speed comparable to an N-bit wide load'. With the full alignment ds128 is slower than ds96 for example. If underaligned it is comparable to the speed of a single dword access, which would then mean 32 < 128 and it is faster to issue a wide load regardless. 1 is simply 'slow, don't do it', i.e. when comparing an aligned load to a wider load which will not be aligned anymore, the latter is slower. But essentially it is just a rank; these are not additive.
		arsenm (Unsubmitted, Done): This needs to be commented.
return true;		return true;
}		}

break;		break;
default:		default:
if (Size > 32)		if (Size > 32)
return false;		return false;

break;		break;
}		}

		// See comment on the values above.
		// Note that we have a single-dword or sub-dword access here, so if
		// underaligned it is the slowest possible access, hence the returned
		// value is 0.
if (IsFast)		if (IsFast)
*IsFast = Alignment >= RequiredAlignment;		*IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

return Alignment >= RequiredAlignment \|\|		return Alignment >= RequiredAlignment \|\|
Subtarget->hasUnalignedDSAccessEnabled();		Subtarget->hasUnalignedDSAccessEnabled();
}		}

if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {		if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
bool AlignedBy4 = Alignment >= Align(4);		bool AlignedBy4 = Alignment >= Align(4);
if (IsFast)		if (IsFast)
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	if (IsFast)
*IsFast = 1;		*IsFast = 1;

return Size >= 32 && Alignment >= Align(4);		return Size >= 32 && Alignment >= Align(4);
}		}

bool SITargetLowering::allowsMisalignedMemoryAccesses(		bool SITargetLowering::allowsMisalignedMemoryAccesses(
EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,		EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
unsigned *IsFast) const {		unsigned *IsFast) const {
bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,		return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
Alignment, Flags, IsFast);		Alignment, Flags, IsFast);

if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
(AddrSpace == AMDGPUAS::LOCAL_ADDRESS \|\|
AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
// Lie it is fast if +unaligned-access-mode is passed so that DS accesses
// get vectorized. We could use ds_read2_b/ds_write2_b instructions on a
// misaligned data which is faster than a pair of ds_read_b/ds_write_b
// which would be equally misaligned.
// This is only used by the common passes, selection always calls the
// allowsMisalignedMemoryAccessesImpl version.
*IsFast= 1;
}

return Allow;
}		}

EVT SITargetLowering::getOptimalMemOpType(		EVT SITargetLowering::getOptimalMemOpType(
const MemOp &Op, const AttributeList &FuncAttributes) const {		const MemOp &Op, const AttributeList &FuncAttributes) const {
// FIXME: Should account for address space here.		// FIXME: Should account for address space here.

// The default fallback uses the private pointer size as a guess for a type to		// The default fallback uses the private pointer size as a guess for a type to
// use. Make sure we switch these to 64-bit accesses.		// use. Make sure we switch these to 64-bit accesses.
▲ Show 20 Lines • Show All 7,218 Lines • ▼ Show 20 Lines	if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
default:		default:
llvm_unreachable("unsupported private_element_size");		llvm_unreachable("unsupported private_element_size");
}		}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {		} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {
unsigned Fast = 0;		unsigned Fast = 0;
auto Flags = Load->getMemOperand()->getFlags();		auto Flags = Load->getMemOperand()->getFlags();
if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,		if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
Load->getAlign(), Flags, &Fast) &&		Load->getAlign(), Flags, &Fast) &&
Fast)		Fast > 1)
return SDValue();		return SDValue();

if (MemVT.isVector())		if (MemVT.isVector())
return SplitVectorLoad(Op, DAG);		return SplitVectorLoad(Op, DAG);
}		}

if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),		if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
MemVT, *Load->getMemOperand())) {		MemVT, *Load->getMemOperand())) {
▲ Show 20 Lines • Show All 482 Lines • ▼ Show 20 Lines	if (AS == AMDGPUAS::GLOBAL_ADDRESS \|\|
default:		default:
llvm_unreachable("unsupported private_element_size");		llvm_unreachable("unsupported private_element_size");
}		}
} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {		} else if (AS == AMDGPUAS::LOCAL_ADDRESS \|\| AS == AMDGPUAS::REGION_ADDRESS) {
unsigned Fast = 0;		unsigned Fast = 0;
auto Flags = Store->getMemOperand()->getFlags();		auto Flags = Store->getMemOperand()->getFlags();
if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,		if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
Store->getAlign(), Flags, &Fast) &&		Store->getAlign(), Flags, &Fast) &&
Fast)		Fast > 1)
return SDValue();		return SDValue();

if (VT.isVector())		if (VT.isVector())
return SplitVectorStore(Op, DAG);		return SplitVectorStore(Op, DAG);

return expandUnalignedStore(Store, DAG);		return expandUnalignedStore(Store, DAG);
}		}

▲ Show 20 Lines • Show All 3,953 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll

This file was added.

				; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck --enable-var-scope --check-prefix=GCN %s

				; Check that the vectorizer does not create slow misaligned loads

				; GCN-LABEL: {{^}}ds1align1:
				; GCN-COUNT-2: ds_read_u8
				; GCN-COUNT-2: ds_write_b8
				define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
				%val1 = load i8, i8 addrspace(3)* %in, align 1
				%gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
				%val2 = load i8, i8 addrspace(3)* %gep1, align 1
				store i8 %val1, i8 addrspace(3)* %out, align 1
				%gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
				store i8 %val2, i8 addrspace(3)* %gep2, align 1
				ret void
				}

				; GCN-LABEL: {{^}}ds2align2:
				; GCN-COUNT-2: ds_read_u16
				; GCN-COUNT-2: ds_write_b16
				define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
				%val1 = load i16, i16 addrspace(3)* %in, align 2
				%gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
				%val2 = load i16, i16 addrspace(3)* %gep1, align 2
				store i16 %val1, i16 addrspace(3)* %out, align 2
				%gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
				store i16 %val2, i16 addrspace(3)* %gep2, align 2
				ret void
				}

				; GCN-LABEL: {{^}}ds4align4:
				; GCN: ds_read2_b32
				; GCN: ds_write2_b32
				define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
				%val1 = load i32, i32 addrspace(3)* %in, align 4
				%gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
				%val2 = load i32, i32 addrspace(3)* %gep1, align 4
				store i32 %val1, i32 addrspace(3)* %out, align 4
				%gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
				store i32 %val2, i32 addrspace(3)* %gep2, align 4
				ret void
				}

				; GCN-LABEL: {{^}}ds8align8:
				; GCN: ds_read2_b64
				; GCN: ds_write2_b64
				define amdgpu_kernel void @ds8align8(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
				%val1 = load i64, i64 addrspace(3)* %in, align 8
				%gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
				%val2 = load i64, i64 addrspace(3)* %gep1, align 8
				store i64 %val1, i64 addrspace(3)* %out, align 8
				%gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
				store i64 %val2, i64 addrspace(3)* %gep2, align 8
				ret void
				}

				; GCN-LABEL: {{^}}ds1align2:
				; GCN: ds_read_u16
				; GCN: ds_write_b16
				define amdgpu_kernel void @ds1align2(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
				%val1 = load i8, i8 addrspace(3)* %in, align 2
				%gep1 = getelementptr i8, i8 addrspace(3)* %in, i32 1
				%val2 = load i8, i8 addrspace(3)* %gep1, align 2
				store i8 %val1, i8 addrspace(3)* %out, align 2
				%gep2 = getelementptr i8, i8 addrspace(3)* %out, i32 1
				store i8 %val2, i8 addrspace(3)* %gep2, align 2
				ret void
				}

				; GCN-LABEL: {{^}}ds2align4:
				; GCN: ds_read_b32
				; GCN: ds_write_b32
				define amdgpu_kernel void @ds2align4(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
				%val1 = load i16, i16 addrspace(3)* %in, align 4
				%gep1 = getelementptr i16, i16 addrspace(3)* %in, i32 1
				%val2 = load i16, i16 addrspace(3)* %gep1, align 4
				store i16 %val1, i16 addrspace(3)* %out, align 4
				%gep2 = getelementptr i16, i16 addrspace(3)* %out, i32 1
				store i16 %val2, i16 addrspace(3)* %gep2, align 4
				ret void
				}

				; GCN-LABEL: {{^}}ds4align8:
				; GCN: ds_read_b64
				; GCN: ds_write_b64
				define amdgpu_kernel void @ds4align8(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
				%val1 = load i32, i32 addrspace(3)* %in, align 8
				%gep1 = getelementptr i32, i32 addrspace(3)* %in, i32 1
				%val2 = load i32, i32 addrspace(3)* %gep1, align 8
				store i32 %val1, i32 addrspace(3)* %out, align 8
				%gep2 = getelementptr i32, i32 addrspace(3)* %out, i32 1
				store i32 %val2, i32 addrspace(3)* %gep2, align 8
				ret void
				}

				; GCN-LABEL: {{^}}ds8align16:
				; GCN: ds_read_b128
				; GCN: ds_write_b128
				define amdgpu_kernel void @ds8align16(i64 addrspace(3)* %in, i64 addrspace(3)* %out) {
				%val1 = load i64, i64 addrspace(3)* %in, align 16
				%gep1 = getelementptr i64, i64 addrspace(3)* %in, i64 1
				%val2 = load i64, i64 addrspace(3)* %gep1, align 16
				store i64 %val1, i64 addrspace(3)* %out, align 16
				%gep2 = getelementptr i64, i64 addrspace(3)* %out, i64 1
				store i64 %val2, i64 addrspace(3)* %gep2, align 16
				ret void
				}

llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll

	Show First 20 Lines • Show All 306 Lines • ▼ Show 20 Lines
	}			}

	%struct0 = type { [4224 x %type.i16] }			%struct0 = type { [4224 x %type.i16] }
	%type.i16 = type { i16 }			%type.i16 = type { i16 }
	@_ZZN0 = external hidden addrspace(3) global %struct0, align 8			@_ZZN0 = external hidden addrspace(3) global %struct0, align 8

	; GFX11-LABEL: tied_operand_test:			; GFX11-LABEL: tied_operand_test:
	; GFX11: ; %bb.0: ; %entry			; GFX11: ; %bb.0: ; %entry
	; GFX11: scratch_load_d16_hi_b16 [[LDRESULT:v[0-9]+]], off, off offset:4			; GFX11-DAG: scratch_load_u16 [[LDRESULT:v[0-9]+]], off, off offset:4
	; GFX11-NEXT: s_waitcnt vmcnt(0)			; GFX11-DAG: v_mov_b32_e32 [[C:v[0-9]+]], 0x7b
	; GFX11-NEXT: ds_store_b32 v{{[0-9]+}}, [[LDRESULT]] offset:8			; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[LDRESULT]] offset:10
				; GFX11-DAG: ds_store_b16 v{{[0-9]+}}, [[C]] offset:8
	; GFX11-NEXT: s_endpgm			; GFX11-NEXT: s_endpgm
	define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {			define protected amdgpu_kernel void @tied_operand_test(i1 %c1, i1 %c2, i32 %val) {
	entry:			entry:
	%scratch0 = alloca i16, align 4, addrspace(5)			%scratch0 = alloca i16, align 4, addrspace(5)
	%scratch1 = alloca i16, align 4, addrspace(5)			%scratch1 = alloca i16, align 4, addrspace(5)
	%first = select i1 %c1, i16 addrspace(5)* %scratch0, i16 addrspace(5)* %scratch1			%first = select i1 %c1, i16 addrspace(5)* %scratch0, i16 addrspace(5)* %scratch1
	%spec.select = select i1 %c2, i16 addrspace(5)* %first, i16 addrspace(5)* %scratch0			%spec.select = select i1 %c2, i16 addrspace(5)* %first, i16 addrspace(5)* %scratch0
	%dead.load = load i16, i16 addrspace(5)* %spec.select, align 2			%dead.load = load i16, i16 addrspace(5)* %spec.select, align 2
	Show All 11 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fine tune LDS misaligned access speed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 478411

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll

llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fine tune LDS misaligned access speed
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 478411

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/ds-vectorization-alignment.ll

llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll

[AMDGPU] Fine tune LDS misaligned access speed
ClosedPublic