This is an archive of the discontinued LLVM Phabricator instance.

DAG: Don't try to cluster loads with tied inputs
ClosedPublic

Authored by arsenm on Mar 4 2019, 10:18 AM.

Download Raw Diff

Details

Reviewers

bogner
rampitec
cfang
sunfish

Summary

This avoids breaking possible value dependencies when sorting loads by
offset.

AMDGPU has some load instructions that write into the high or low bits
of the destination register, and have a tied input for the other input
bits. These can easily have the same base pointer, but be a swizzle so
the high address load needs to come first. This was inserting glue
forcing the opposite ordering, producing a cycle the InstrEmitter
would assert on. It may be potentially expensive to look for the
dependency between the other loads, so just skip any where this could
happen.

Fixes bug 40936 by reverting r351379, which added a hacky attempt to
fix this by adding chains in this case, which I think was just working
around broken glue before the InstrEmitter. The core of the patch is
re-implementing the fix for that problem.

Diff Detail

Event Timeline

arsenm created this revision. Mar 4 2019, 10:18 AM

Herald added subscribers: mgrang, tpr, nhaehnle and 3 others. · View Herald Transcript · Mar 4 2019, 10:18 AM

rampitec added inline comments. Mar 4 2019, 10:30 AM

lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
277	Isn't it easier and cleaner to compute the new BaseOff once, while sorting?

arsenm marked an inline comment as done. Mar 4 2019, 10:40 AM

arsenm added inline comments.

lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
277	This is dependent on each offset, so I don’t understand what you mean. The alternative is to delete the assert in the SI implementation, which isn’t really necessary.

rampitec added inline comments. Mar 4 2019, 11:12 AM

lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
271	There is an unlikely but possible case where neither load is an operand of the other, but one depends on the other through a third instruction. Also, what would happen with: A: load i16 [base]; B: load i16 [base + 2]; C: load i8 [base + 1]? I guess normal sorting will give A < C < B. With your change: B < A, A < C, C < B. That creates an impossible sort order.
277	I mean you could compute min(Offsets[]) during the sort and use it as the new base. Otherwise you may send a different base offset to the target with each call for the same chain.

Just skip loads with tied inputs

arsenm marked an inline comment as done. Mar 7 2019, 3:26 PM

arsenm added inline comments.

lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
271	It turns out this is already broken even if clustering is disabled. Fixing this requires another patch to rewrite the patterns.

LGTM

This revision is now accepted and ready to land. Mar 7 2019, 3:46 PM

r355728

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

ScheduleDAGSDNodes.cpp

21 lines

Target/

AMDGPU/

SIISelLowering.cpp

45 lines

test/

CodeGen/

AMDGPU/

chain-hi-to-lo.ll

38 lines

Diff 189790

lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp

	Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines
	void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {			void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
	SDNode *Chain = nullptr;			SDNode *Chain = nullptr;
	unsigned NumOps = Node->getNumOperands();			unsigned NumOps = Node->getNumOperands();
	if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)			if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
	Chain = Node->getOperand(NumOps-1).getNode();			Chain = Node->getOperand(NumOps-1).getNode();
	if (!Chain)			if (!Chain)
	return;			return;

				// Skip any load instruction that has a tied input. There may be an additional
				// dependency requiring a different order than by increasing offsets, and the
				// added glue may introduce a cycle.
				auto hasTiedInput = [this](const SDNode *N) {
				const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
				for (unsigned I = 0; I != MCID.getNumOperands(); ++I) {
				if (MCID.getOperandConstraint(I, MCOI::TIED_TO) != -1)
				return true;
				}

				return false;
				};

	// Look for other loads of the same chain. Find loads that are loading from			// Look for other loads of the same chain. Find loads that are loading from
	// the same base pointer and different offsets.			// the same base pointer and different offsets.
	SmallPtrSet<SDNode*, 16> Visited;			SmallPtrSet<SDNode*, 16> Visited;
	SmallVector<int64_t, 4> Offsets;			SmallVector<int64_t, 4> Offsets;
	DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.			DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.
	bool Cluster = false;			bool Cluster = false;
	SDNode *Base = Node;			SDNode *Base = Node;

				if (hasTiedInput(Base))
				return;

	// This algorithm requires a reasonably low use count before finding a match			// This algorithm requires a reasonably low use count before finding a match
	// to avoid uselessly blowing up compile time in large blocks.			// to avoid uselessly blowing up compile time in large blocks.
	unsigned UseCount = 0;			unsigned UseCount = 0;
	for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();			for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
	I != E && UseCount < 100; ++I, ++UseCount) {			I != E && UseCount < 100; ++I, ++UseCount) {
	SDNode User = I;			SDNode User = I;
	if (User == Node \|\| !Visited.insert(User).second)			if (User == Node \|\| !Visited.insert(User).second)
	continue;			continue;
	int64_t Offset1, Offset2;			int64_t Offset1, Offset2;
	if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) \|\|			if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) \|\|
	Offset1 == Offset2)			Offset1 == Offset2 \|\|
				hasTiedInput(User)) {
	// FIXME: Should be ok if they addresses are identical. But earlier			// FIXME: Should be ok if they addresses are identical. But earlier
	// optimizations really should have eliminated one of the loads.			// optimizations really should have eliminated one of the loads.
	continue;			continue;
				}
	if (O2SMap.insert(std::make_pair(Offset1, Base)).second)			if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
	Offsets.push_back(Offset1);			Offsets.push_back(Offset1);
	O2SMap.insert(std::make_pair(Offset2, User));			O2SMap.insert(std::make_pair(Offset2, User));
	Offsets.push_back(Offset2);			Offsets.push_back(Offset2);
	if (Offset2 < Offset1)			if (Offset2 < Offset1)
	Base = User;			Base = User;
	Cluster = true;			Cluster = true;
	// Reset UseCount to allow more matches.			// Reset UseCount to allow more matches.
	UseCount = 0;			UseCount = 0;
	}			}

	if (!Cluster)			if (!Cluster)
	return;			return;

	// Sort them in increasing order.			// Sort them in increasing order.
	llvm::sort(Offsets);			llvm::sort(Offsets);

	// Check if the loads are close enough.			// Check if the loads are close enough.
	SmallVector<SDNode*, 4> Loads;			SmallVector<SDNode*, 4> Loads;
	unsigned NumLoads = 0;			unsigned NumLoads = 0;
	int64_t BaseOff = Offsets[0];			int64_t BaseOff = Offsets[0];
	SDNode *BaseLoad = O2SMap[BaseOff];			SDNode *BaseLoad = O2SMap[BaseOff];
	Loads.push_back(BaseLoad);			Loads.push_back(BaseLoad);
	for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {			for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
				rampitecUnsubmitted Not Done Reply Inline Actions There is unlikely but possible case neither is operand of another but depend through a third instruction. Also what would happen if: A: load i16 [base] B: load i16 [base + 2] C: load i8 [base + 1] I guess normal sorting will tell A < C < B. With your change: B < A, A < C, C < B. That creates an impossible sort order. rampitec: There is unlikely but possible case neither is operand of another but depend through a third…
				arsenmAuthorUnsubmitted Done Reply Inline Actions It turns out this is already broken even if clustering is disabled. Fixing this requires another patch to rewrite the patterns arsenm: It turns out this is already broken even if clustering is disabled. Fixing this requires…
	int64_t Offset = Offsets[i];			int64_t Offset = Offsets[i];
	SDNode *Load = O2SMap[Offset];			SDNode *Load = O2SMap[Offset];
	if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))			if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
	break; // Stop right here. Ignore loads that are further away.			break; // Stop right here. Ignore loads that are further away.
	Loads.push_back(Load);			Loads.push_back(Load);
	++NumLoads;			++NumLoads;
				rampitecUnsubmitted Not Done Reply Inline Actions Isn't that easier and cleaner to compute new BaseOff once yet while sorting? rampitec: Isn't that easier and cleaner to compute new BaseOff once yet while sorting?
				arsenmAuthorUnsubmitted Done Reply Inline Actions This is dependent on each offset so I don’t understand what you mean . The alternative is to delete the assert in the SI implementation, which isn’t really necessary arsenm: This is dependent on each offset so I don’t understand what you mean . The alternative is to…
				rampitecUnsubmitted Not Done Reply Inline Actions I mean you could compute min(Offsets[]) during sort and use it as a new base. Otherwise you may send different base offset to target with each call for the same chain. rampitec: I mean you could compute min(Offsets[]) during sort and use it as a new base. Otherwise you may…
	}			}

	if (NumLoads == 0)			if (NumLoads == 0)
	return;			return;

	// Cluster loads by adding MVT::Glue outputs and inputs. This also			// Cluster loads by adding MVT::Glue outputs and inputs. This also
	// ensure they are scheduled in order of increasing addresses.			// ensure they are scheduled in order of increasing addresses.
	SDNode *Lead = Loads[0];			SDNode *Lead = Loads[0];
	▲ Show 20 Lines • Show All 732 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,361 Lines • ▼ Show 20 Lines	case AMDGPU::V_DIV_SCALE_F64: {

SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };		SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)		for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
Ops.push_back(Node->getOperand(I));		Ops.push_back(Node->getOperand(I));

Ops.push_back(ImpDef.getValue(1));		Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);		return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}		}
case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
case AMDGPU::DS_READ_U16_D16_HI:
case AMDGPU::DS_READ_I8_D16_HI:
case AMDGPU::DS_READ_U8_D16_HI:
case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
// For these loads that write to the HI part of a register,
// we should chain them to the op that writes to the LO part
// of the register to maintain the order.
unsigned NumOps = Node->getNumOperands();
SDValue OldChain = Node->getOperand(NumOps-1);

if (OldChain.getValueType() != MVT::Other)
break;

// Look for the chain to replace to.
SDValue Lo = Node->getOperand(NumOps-2);
SDNode *LoNode = Lo.getNode();
if (LoNode->getNumValues() == 1 \|\|
LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
break;

SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
if (NewChain == OldChain) // Already replaced.
break;

SmallVector<SDValue, 16> Ops;
for (unsigned I = 0; I < NumOps-1; ++I)
Ops.push_back(Node->getOperand(I));
// Repalce the Chain.
Ops.push_back(NewChain);
MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
Node->getVTList(), Ops);
DAG.setNodeMemRefs(NewNode, Node->memoperands());
return NewNode;
}
default:		default:
break;		break;
}		}

return Node;		return Node;
}		}

/// Assign the register class depending on the number of		/// Assign the register class depending on the number of
▲ Show 20 Lines • Show All 405 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/chain-hi-to-lo.ll

; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}chain_hi_to_lo_private:		; GCN-LABEL: {{^}}chain_hi_to_lo_private:
; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2		; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)		; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]		; GCN-NEXT: buffer_load_short_d16_hi [[DST]], off, [[RSRC]], [[SOFF]]
define <2 x half> @chain_hi_to_lo_private() {		define <2 x half> @chain_hi_to_lo_private() {
bb:		bb:
%gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1		%gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	bb:
%load_lo = load half, half* %base_lo		%load_lo = load half, half* %base_lo
%load_hi = load half, half* %base_hi		%load_hi = load half, half* %base_hi

%temp = insertelement <2 x half> undef, half %load_lo, i32 0		%temp = insertelement <2 x half> undef, half %load_lo, i32 0
%result = insertelement <2 x half> %temp, half %load_hi, i32 1		%result = insertelement <2 x half> %temp, half %load_hi, i32 1

ret <2 x half> %result		ret <2 x half> %result
}		}

		; Make sure we don't lose any of the private stores.
		; GCN-LABEL: {{^}}vload2_private:
		; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
		; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
		; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8

		; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
		; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
		; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
		define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
		entry:
		%loc = alloca [3 x i16], align 2, addrspace(5)
		%loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
		%tmp = load i16, i16 addrspace(1)* %in, align 2
		%loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
		store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
		%arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
		%tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
		%loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
		store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
		%arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
		%tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
		%loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
		store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
		%loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
		%loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
		store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
		%loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
		%loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
		%loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
		%arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
		store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
		%loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
		ret void
		}