This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Fix broken FrameIndex handling
ClosedPublic

Authored by arsenm on Sep 7 2016, 7:40 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD
nhaehnle

Summary

We were trying to avoid using a FrameIndex in non-pointer operands
in a convoluted way, and it would break because it used a
TargetFrameIndex. A TargetFrameIndex should only be used where it
makes sense to fold it into the addressing mode; otherwise the frame
index requires materialization like a normal constant. The old approach
wasn't working reliably and failed in the added testcase, hitting
the assert when processing the frame index.

The TargetFrameIndex was coming from trying to produce an AssertZext
limiting the maximum stack size. I'm not sure this was correct to begin
with, because it is apparently possible to have a single workitem
dispatch that requires all 4G of private memory.

This mostly improves the generated code, but a few of the cases in
amdgpu.private-memory.ll are worse.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 70637. Sep 7 2016, 7:40 PM

arsenm retitled this revision to AMDGPU: Fix broken FrameIndex handling.

arsenm updated this object.

arsenm added a reviewer: • tstellarAMD.

arsenm added a subscriber: llvm-commits.

Herald added subscribers: nhaehnle, wdng, arsenm. · View Herald Transcript · Sep 7 2016, 7:40 PM

Fix missing piece of patch and test changes that belong with later commit

In D24328#536701, @arsenm wrote:

Fix missing piece of patch and test changes that belong with later commit

Actually, I think omitting the DAG folding of the FrameIndex into the addressing mode, and letting LocalStackSlotAllocation figure it out instead, is where all of the improvements came from, so the original patch is probably better.

arsenm mentioned this in D24324: AMDGPU: Fix selection failure with dead AssertZext. Sep 9 2016, 11:56 AM

Ping. Changing to fold frame indexes into memory instructions needs to be a separate patch.

LGTM

This revision is now accepted and ready to land. Sep 17 2016, 7:04 AM

r281824

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

71 lines

1 line

39 lines

5 lines

5 lines

test/

CodeGen/

AMDGPU/

captured-frame-index.ll

31 lines

local-stack-slot-bug.ll

6 lines

Diff 70643

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	class AMDGPUDAGToDAGISel : public SelectionDAGISel {
const AMDGPUSubtarget *Subtarget;		const AMDGPUSubtarget *Subtarget;

public:		public:
AMDGPUDAGToDAGISel(TargetMachine &TM);		AMDGPUDAGToDAGISel(TargetMachine &TM);
virtual ~AMDGPUDAGToDAGISel();		virtual ~AMDGPUDAGToDAGISel();
bool runOnMachineFunction(MachineFunction &MF) override;		bool runOnMachineFunction(MachineFunction &MF) override;
void Select(SDNode *N) override;		void Select(SDNode *N) override;
const char *getPassName() const override;		const char *getPassName() const override;
void PreprocessISelDAG() override;
void PostprocessISelDAG() override;		void PostprocessISelDAG() override;

private:		private:
		SDValue foldFrameIndex(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;		bool isInlineImmediate(const SDNode *N) const;
bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,		bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
const R600InstrInfo *TII);		const R600InstrInfo *TII);
bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);		bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);		bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);

bool isConstantLoad(const MemSDNode *N, int cbID) const;		bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool isUniformBr(const SDNode *N) const;		bool isUniformBr(const SDNode *N) const;
▲ Show 20 Lines • Show All 827 Lines • ▼ Show 20 Lines	bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &Offset,		SDValue &Offset,
SDValue &SLC) const {		SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);		SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
SDValue GLC, TFE;		SDValue GLC, TFE;

return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);		return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE);
}		}

		SDValue AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
		if (auto FI = dyn_cast<FrameIndexSDNode>(N))
		return CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
		return N;
		}

bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,		bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDValue &VAddr, SDValue &SOffset,		SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {		SDValue &ImmOffset) const {

SDLoc DL(Addr);		SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();		MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();		const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);		Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);		SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);

// (add n0, c1)		// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {		if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);		SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);		SDValue N1 = Addr.getOperand(1);

// Offsets in vaddr must be positive.		// Offsets in vaddr must be positive.
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);		ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
if (isLegalMUBUFImmOffset(C1)) {		if (isLegalMUBUFImmOffset(C1)) {
VAddr = N0;		VAddr = foldFrameIndex(N0);
ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);		ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;		return true;
}		}
}		}

// (node)		// (node)
VAddr = Addr;		VAddr = foldFrameIndex(Addr);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);		ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;		return true;
}		}

bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,		bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,		SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,		SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {		SDValue &TFE) const {
▲ Show 20 Lines • Show All 545 Lines • ▼ Show 20 Lines
bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,		bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
SDValue &SrcMods,		SDValue &SrcMods,
SDValue &Clamp,		SDValue &Clamp,
SDValue &Omod) const {		SDValue &Omod) const {
Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);		Clamp = Omod = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
return SelectVOP3Mods(In, Src, SrcMods);		return SelectVOP3Mods(In, Src, SrcMods);
}		}

void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();

// Handle the perverse case where a frame index is being stored. We don't
// want to see multiple frame index operands on the same instruction since
// it complicates things and violates some assumptions about frame index
// lowering.
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);

// It's possible that we have a frame index defined in the function that
// isn't used in this block.
if (FI.use_empty())
continue;

// Skip over the AssertZext inserted during lowering.
SDValue EffectiveFI = FI;
auto It = FI->use_begin();
if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
EffectiveFI = SDValue(*It, 0);
It = EffectiveFI->use_begin();
}

for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
SDUse &Use = It.getUse();
SDNode *User = Use.getUser();
unsigned OpIdx = It.getOperandNo();
++It;

if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
if (OpIdx == PtrIdx)
continue;

unsigned OpN = M->getNumOperands();
SDValue NewOps[8];

assert(OpN < array_lengthof(NewOps));
for (unsigned Op = 0; Op != OpN; ++Op) {
if (Op != OpIdx) {
NewOps[Op] = M->getOperand(Op);
continue;
}

MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
SDLoc(M), MVT::i32, FI);
NewOps[Op] = SDValue(Mov, 0);
}

CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
}

if (EffectiveFI->use_empty())
CurDAG->RemoveDeadNode(EffectiveFI.getNode());
}
}
}

void AMDGPUDAGToDAGISel::PostprocessISelDAG() {		void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
const AMDGPUTargetLowering& Lowering =		const AMDGPUTargetLowering& Lowering =
static_cast<const AMDGPUTargetLowering>(getTargetLowering());		static_cast<const AMDGPUTargetLowering>(getTargetLowering());
bool IsModified = false;		bool IsModified = false;
do {		do {
IsModified = false;		IsModified = false;
// Go over all selected nodes and try to fold them a bit more		// Go over all selected nodes and try to fold them a bit more
for (SDNode &Node : CurDAG->allnodes()) {		for (SDNode &Node : CurDAG->allnodes()) {
Show All 13 Lines

lib/Target/AMDGPU/SIISelLowering.h

Show All 27 Lines	class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,		SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;		SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,		SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
MVT VT, unsigned Offset) const;		MVT VT, unsigned Offset) const;

SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFastUnsafeFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;		SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool Signed) const;
▲ Show 20 Lines • Show All 118 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2i32, Custom);		setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i32, Custom);		setOperationAction(ISD::STORE, MVT::v4i32, Custom);
setOperationAction(ISD::STORE, MVT::v8i32, Custom);		setOperationAction(ISD::STORE, MVT::v8i32, Custom);
setOperationAction(ISD::STORE, MVT::v16i32, Custom);		setOperationAction(ISD::STORE, MVT::v16i32, Custom);
setOperationAction(ISD::STORE, MVT::i1, Custom);		setOperationAction(ISD::STORE, MVT::i1, Custom);

setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);		setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);		setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);		setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

setOperationAction(ISD::SELECT, MVT::i1, Promote);		setOperationAction(ISD::SELECT, MVT::i1, Promote);
setOperationAction(ISD::SELECT, MVT::i64, Custom);		setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Promote);		setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);		AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);		setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
▲ Show 20 Lines • Show All 1,449 Lines • ▼ Show 20 Lines

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations		// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {		SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {		switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);		default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);		case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {		case ISD::LOAD: {
SDValue Result = LowerLOAD(Op, DAG);		SDValue Result = LowerLOAD(Op, DAG);
assert((!Result.getNode() ||		assert((!Result.getNode() ||
Result.getNode()->getNumValues() == 2) &&		Result.getNode()->getNumValues() == 2) &&
"Load should return a value and a chain");		"Load should return a value and a chain");
return Result;		return Result;
}		}
Show All 30 Lines	if (I.getUse().get() != Value)
continue;		continue;

if (I->getOpcode() == Opcode)		if (I->getOpcode() == Opcode)
return *I;		return *I;
}		}
return nullptr;		return nullptr;
}		}

SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {

SDLoc SL(Op);
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FINode->getIndex();

// A FrameIndex node represents a 32-bit offset into scratch memory. If the
// high bit of a frame index offset were to be set, this would mean that it
// represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
// buffer, with 64 being the number of threads per wave.
//
// The maximum private allocation for the entire GPU is 4G, and we are
// concerned with the largest the index could ever be for an individual
// workitem. This will occur with the minmum dispatch size. If a program
// requires more, the dispatch size will be reduced.
//
// With this limit, we can mark the high bit of the FrameIndex node as known
// zero, which is important, because it means in most situations we can prove
// that values derived from FrameIndex nodes are non-negative. This enables us
// to take advantage of more addressing modes when accessing scratch buffers,
// since for scratch reads/writes, the register offset must always be
// positive.

uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;

// XXX - It is unclear if partial dispatch works. Assume it works at half wave
// granularity. It is probably a full wave.
uint64_t MinGranularity = 32;

unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);

SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
DAG.getValueType(ExtVT));
}

bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {		bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)		if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
return false;		return false;

switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {		switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
default: return false;		default: return false;
case AMDGPUIntrinsic::amdgcn_if:		case AMDGPUIntrinsic::amdgcn_if:
case AMDGPUIntrinsic::amdgcn_else:		case AMDGPUIntrinsic::amdgcn_else:
▲ Show 20 Lines • Show All 2,163 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.td

	Show First 20 Lines • Show All 289 Lines • ▼ Show 20 Lines
	}]>;			}]>;

	// Copied from the AArch64 backend:			// Copied from the AArch64 backend:
	def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{			def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(			return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);			N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
	}]>;			}]>;

				def frameindex_to_targetframeindex : SDNodeXForm<frameindex, [{
				auto FI = cast<FrameIndexSDNode>(N);
				return CurDAG->getTargetFrameIndex(FI->getIndex(), MVT::i32);
				}]>;

	// Copied from the AArch64 backend:			// Copied from the AArch64 backend:
	def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{			def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(			return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);			N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
	}]>;			}]>;

	def SIMM16bit : PatLeaf <(imm),			def SIMM16bit : PatLeaf <(imm),
	[{return isInt<16>(N->getSExtValue());}]			[{return isInt<16>(N->getSExtValue());}]
	▲ Show 20 Lines • Show All 2,285 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 1,934 Lines • ▼ Show 20 Lines
	>;			>;

	def : Pat <			def : Pat <
	(f32 fpimm:$imm),			(f32 fpimm:$imm),
	(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))			(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
	>;			>;

	def : Pat <			def : Pat <
				(i32 frameindex:$fi),
				(V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
				>;

				def : Pat <
	(i64 InlineImm<i64>:$imm),			(i64 InlineImm<i64>:$imm),
	(S_MOV_B64 InlineImm<i64>:$imm)			(S_MOV_B64 InlineImm<i64>:$imm)
	>;			>;

	// XXX - Should this use a s_cmp to set SCC?			// XXX - Should this use a s_cmp to set SCC?

	// Set to sign-extended 64-bit value (true = -1, false = 0)			// Set to sign-extended 64-bit value (true = -1, false = 0)
	def : Pat <			def : Pat <
	▲ Show 20 Lines • Show All 490 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/captured-frame-index.ll

; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s		; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

		; GCN-LABEL: {{^}}store_fi_lifetime:
		; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
		; GCN: buffer_store_dword [[FI]]
		define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
		entry:
		%b = alloca i8
		call void @llvm.lifetime.start(i64 1, i8* %b)
		store volatile i8* %b, i8* addrspace(1)* undef
		call void @llvm.lifetime.end(i64 1, i8* %b)
		ret void
		}

; GCN-LABEL: {{^}}stored_fi_to_lds:		; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]		; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}		; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]		; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}		; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]		; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]		; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {		define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
store volatile float 0.0, float *%tmp1		store volatile float 0.0, float *%tmp1
store volatile float 0.0, float *%tmp2		store volatile float 0.0, float *%tmp2
store volatile float* %tmp1, float* addrspace(1)* %ptr		store volatile float* %tmp1, float* addrspace(1)* %ptr
store volatile float* %tmp2, float* addrspace(1)* %ptr		store volatile float* %tmp2, float* addrspace(1)* %ptr
ret void		ret void
}		}

; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:		; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
; GCN: s_add_i32 [[BASE_1_OFF_0:s[0-9]+]], 0, 0x3ffc		; GCN: v_mov_b32_e32 [[VAL_0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}		; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[BASE_0]], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen		; GCN: buffer_store_dword [[VAL_0]], [[BASE_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen

		; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
		; GCN: v_add_i32_e32 [[BASE_1_OFF_0:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]

; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_0:v[0-9]+]], [[BASE_1_OFF_0]]
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}		; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: s_add_i32 [[BASE_1_OFF_1:s[0-9]+]], 0, 56		; GCN: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 56, [[BASE_0_1]]
; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}		; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_0]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}

; GCN: v_mov_b32_e32 [[V_BASE_1_OFF_1:v[0-9]+]], [[BASE_1_OFF_1]]		; GCN: buffer_store_dword [[BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: buffer_store_dword [[V_BASE_1_OFF_1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {		define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
%tmp0 = alloca [4096 x i32]		%tmp0 = alloca [4096 x i32]
%tmp1 = alloca [4096 x i32]		%tmp1 = alloca [4096 x i32]
%gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0		%gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
store volatile i32 0, i32* %gep0.tmp0		store volatile i32 0, i32* %gep0.tmp0
%gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095		%gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095
store volatile i32 999, i32* %gep1.tmp0		store volatile i32 999, i32* %gep1.tmp0
%gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14		%gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14
Show All 15 Lines	entry:
%b = alloca i32, align 4		%b = alloca i32, align 4
%tmp1 = load volatile i32, i32 addrspace(1)* @g1, align 4		%tmp1 = load volatile i32, i32 addrspace(1)* @g1, align 4
%arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx		%arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
%tmp2 = load i32, i32* %arrayidx, align 4		%tmp2 = load i32, i32* %arrayidx, align 4
store volatile i32* %b, i32* addrspace(1)* undef		store volatile i32* %b, i32* addrspace(1)* undef
ret void		ret void
}		}

		declare void @llvm.lifetime.start(i64, i8* nocapture) #1
		declare void @llvm.lifetime.end(i64, i8* nocapture) #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
		attributes #1 = { argmemonly nounwind }

test/CodeGen/AMDGPU/local-stack-slot-bug.ll

	; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s			; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s

	; This used to fail due to a v_add_i32 instruction with an illegal immediate			; This used to fail due to a v_add_i32 instruction with an illegal immediate
	; operand that was created during Local Stack Slot Allocation. Test case derived			; operand that was created during Local Stack Slot Allocation. Test case derived
	; from https://bugs.freedesktop.org/show_bug.cgi?id=96602			; from https://bugs.freedesktop.org/show_bug.cgi?id=96602
	;			;
	; CHECK-LABEL: {{^}}main:			; CHECK-LABEL: {{^}}main:
	; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0			; CHECK: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
	; CHECK: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]			; CHECK-DAG: v_mov_b32_e32 [[ZERO_BASE_FI:v[0-9]+]], 0{{$}}
	; CHECK: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]			; CHECK-DAG: v_add_i32_e32 [[HI_OFF:v[0-9]+]], vcc, 0x200, [[BYTES]]
				; CHECK-DAG: v_add_i32_e32 [[LO_OFF:v[0-9]+]], vcc, 0, [[BYTES]]

	; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen			; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
	; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen			; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
	define amdgpu_ps float @main(i32 %idx) {			define amdgpu_ps float @main(i32 %idx) {
	main_body:			main_body:
	%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx			%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 
undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
	%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx			%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 
undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
	%r = fadd float %v1, %v2			%r = fadd float %v1, %v2
	ret float %r			ret float %r
	}			}