Diff 206420

llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Show First 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	DisasmLines.push_back(
+ "_" + Twine(MBB.getNumber()) + ":").str());		+ "_" + Twine(MBB.getNumber()) + ":").str());
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());		DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
HexLines.push_back("");		HexLines.push_back("");
}		}
AsmPrinter::EmitBasicBlockStart(MBB);		AsmPrinter::EmitBasicBlockStart(MBB);
}		}

// Side-by-side diff of AMDGPUAsmPrinter::EmitGlobalVariable (old revision on
// the left, new revision on the right).
// Old: returns without emitting when AMDGPU::isGroupSegment(GV) is true,
// otherwise defers to AsmPrinter::EmitGlobalVariable.
// New: a global in AMDGPUAS::LOCAL_ADDRESS is handled entirely inside the
// address-space branch and never reaches AsmPrinter::EmitGlobalVariable; every
// other global still takes the generic path.
void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {		void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
		if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
		// Only a missing or undef initializer is accepted; anything else is
		// reported through OutContext and the variable is not emitted.
		if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
		OutContext.reportError({},
		Twine(GV->getName()) +
		": unsupported initializer for address space");
		return;
		}

		// LDS variables aren't emitted in HSA or PAL yet.
		const Triple::OSType OS = TM.getTargetTriple().getOS();
		if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
		return;

		// Look up the symbol for GV and refuse to define it twice: after
		// redefineIfPossible(), a symbol that is still defined or is a variable
		// aborts with a fatal error.
		MCSymbol *GVSym = getSymbol(GV);

// Group segment variables aren't emitted in HSA.		GVSym->redefineIfPossible();
if (AMDGPU::isGroupSegment(GV))		if (GVSym->isDefined() \|\| GVSym->isVariable())
		report_fatal_error("symbol '" + Twine(GVSym->getName()) +
		"' is already defined");

		// Size is the data layout's alloc size of the value type. An alignment
		// of 0 is replaced by 4 (presumably 0 means "unspecified" - confirm).
		const DataLayout &DL = GV->getParent()->getDataLayout();
		uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
		unsigned Align = GV->getAlignment();
		if (!Align)
		Align = 4;

		// Emit visibility and linkage for the symbol, then pass symbol, size and
		// alignment to the target streamer's emitAMDGPULDS.
		EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
		EmitLinkage(GV, GVSym);
		getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align);
return;		return;
		}

AsmPrinter::EmitGlobalVariable(GV);		AsmPrinter::EmitGlobalVariable(GV);
}		}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {		bool AMDGPUAsmPrinter::doFinalization(Module &M) {
CallGraphResourceInfo.clear();		CallGraphResourceInfo.clear();

// Pad with s_code_end to help tools and guard against instruction prefetch		// Pad with s_code_end to help tools and guard against instruction prefetch
▲ Show 20 Lines • Show All 892 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 479 Lines • ▼ Show 20 Lines	enum NodeType : unsigned {
SENDMSGHALT,		SENDMSGHALT,
INTERP_MOV,		INTERP_MOV,
INTERP_P1,		INTERP_P1,
INTERP_P2,		INTERP_P2,
INTERP_P1LL_F16,		INTERP_P1LL_F16,
INTERP_P1LV_F16,		INTERP_P1LV_F16,
INTERP_P2_F16,		INTERP_P2_F16,
PC_ADD_REL_OFFSET,		PC_ADD_REL_OFFSET,
		LDS,
KILL,		KILL,
DUMMY_CHAIN,		DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,		FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,		LOAD_D16_HI,
LOAD_D16_LO,		LOAD_D16_LO,
LOAD_D16_HI_I8,		LOAD_D16_HI_I8,
LOAD_D16_HI_U8,		LOAD_D16_HI_U8,
LOAD_D16_LO_I8,		LOAD_D16_LO_I8,
▲ Show 20 Lines • Show All 48 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 4,351 Lines • ▼ Show 20 Lines	const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PKNORM_U16_F32)		NODE_NAME_CASE(CVT_PKNORM_U16_F32)
NODE_NAME_CASE(CVT_PK_I16_I32)		NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)		NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)		NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)		NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)		NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)		NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)		NODE_NAME_CASE(PC_ADD_REL_OFFSET)
		NODE_NAME_CASE(LDS)
NODE_NAME_CASE(KILL)		NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)		NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;		case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
NODE_NAME_CASE(INIT_EXEC)		NODE_NAME_CASE(INIT_EXEC)
NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)		NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
NODE_NAME_CASE(SENDMSG)		NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(SENDMSGHALT)		NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)		NODE_NAME_CASE(INTERP_MOV)
▲ Show 20 Lines • Show All 198 Lines • ▼ Show 20 Lines	void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_UBYTE: {		case AMDGPUISD::BUFFER_LOAD_UBYTE: {
Known.Zero.setHighBits(24);		Known.Zero.setHighBits(24);
break;		break;
}		}
case AMDGPUISD::BUFFER_LOAD_USHORT: {		case AMDGPUISD::BUFFER_LOAD_USHORT: {
Known.Zero.setHighBits(16);		Known.Zero.setHighBits(16);
break;		break;
}		}
		case AMDGPUISD::LDS: {
		auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
		unsigned Align = GA->getGlobal()->getAlignment();

		Known.Zero.setHighBits(16);
		if (Align)
		Known.Zero.setLowBits(Log2_32(Align));
		break;
		}
case ISD::INTRINSIC_WO_CHAIN: {		case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();		unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {		switch (IID) {
case Intrinsic::amdgcn_mbcnt_lo:		case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi: {		case Intrinsic::amdgcn_mbcnt_hi: {
const GCNSubtarget &ST =		const GCNSubtarget &ST =
DAG.getMachineFunction().getSubtarget<GCNSubtarget>();		DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
// These return at most the wavefront size - 1.		// These return at most the wavefront size - 1.
▲ Show 20 Lines • Show All 161 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	FoldCandidate(MachineInstr MI, unsigned OpNo, MachineOperand FoldOp,
UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),		UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
Kind(FoldOp->getType()),		Kind(FoldOp->getType()),
Commuted(Commuted_) {		Commuted(Commuted_) {
if (FoldOp->isImm()) {		if (FoldOp->isImm()) {
ImmToFold = FoldOp->getImm();		ImmToFold = FoldOp->getImm();
} else if (FoldOp->isFI()) {		} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();		FrameIndexToFold = FoldOp->getIndex();
} else {		} else {
assert(FoldOp->isReg());		assert(FoldOp->isReg() \|\| FoldOp->isGlobal());
OpToFold = FoldOp;		OpToFold = FoldOp;
}		}
}		}

// True when the recorded operand kind (Kind) is MachineOperand::MO_FrameIndex.
bool isFI() const {		bool isFI() const {
return Kind == MachineOperand::MO_FrameIndex;		return Kind == MachineOperand::MO_FrameIndex;
}		}

// True when the recorded operand kind (Kind) is MachineOperand::MO_Immediate.
bool isImm() const {		bool isImm() const {
return Kind == MachineOperand::MO_Immediate;		return Kind == MachineOperand::MO_Immediate;
}		}

// True when the recorded operand kind (Kind) is MachineOperand::MO_Register.
bool isReg() const {		bool isReg() const {
return Kind == MachineOperand::MO_Register;		return Kind == MachineOperand::MO_Register;
}		}

		// Present only in the new revision: true when the recorded operand kind
		// (Kind) is MachineOperand::MO_GlobalAddress.
		bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

// Returns the stored Commuted flag unchanged.
bool isCommuted() const {		bool isCommuted() const {
return Commuted;		return Commuted;
}		}

// True when ShrinkOpcode holds anything other than -1; -1 acts as the
// "no shrink opcode" marker in this comparison.
bool needsShrink() const {		bool needsShrink() const {
return ShrinkOpcode != -1;		return ShrinkOpcode != -1;
}		}

▲ Show 20 Lines • Show All 147 Lines • ▼ Show 20 Lines	if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
}		}
break;		break;
default:		default:
break;		break;
}		}
}		}
}		}

if ((Fold.isImm() \|\| Fold.isFI()) && Fold.needsShrink()) {		if ((Fold.isImm() \|\| Fold.isFI() \|\| Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();		MachineBasicBlock *MBB = MI->getParent();
auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);		auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
if (Liveness != MachineBasicBlock::LQR_Dead)		if (Liveness != MachineBasicBlock::LQR_Dead)
return false;		return false;

MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();		MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int Op32 = Fold.getShrinkOpcode();		int Op32 = Fold.getShrinkOpcode();
MachineOperand &Dst0 = MI->getOperand(0);		MachineOperand &Dst0 = MI->getOperand(0);
Show All 30 Lines	static bool updateOperand(FoldCandidate &Fold,

assert(!Fold.needsShrink() && "not handled");		assert(!Fold.needsShrink() && "not handled");

if (Fold.isImm()) {		if (Fold.isImm()) {
Old.ChangeToImmediate(Fold.ImmToFold);		Old.ChangeToImmediate(Fold.ImmToFold);
return true;		return true;
}		}

		if (Fold.isGlobal()) {
		Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
		Fold.OpToFold->getTargetFlags());
		return true;
		}

if (Fold.isFI()) {		if (Fold.isFI()) {
Old.ChangeToFrameIndex(Fold.FrameIndexToFold);		Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
return true;		return true;
}		}

MachineOperand *New = Fold.OpToFold;		MachineOperand *New = Fold.OpToFold;
Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);		Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
Old.setIsUndef(New->isUndef());		Old.setIsUndef(New->isUndef());
▲ Show 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
if (!CanCommute \|\|		if (!CanCommute \|\|
!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))		!TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
return false;		return false;

if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {		if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
if ((Opc == AMDGPU::V_ADD_I32_e64 \|\|		if ((Opc == AMDGPU::V_ADD_I32_e64 \|\|
Opc == AMDGPU::V_SUB_I32_e64 \|\|		Opc == AMDGPU::V_SUB_I32_e64 \|\|
Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME		Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
(OpToFold->isImm() \|\| OpToFold->isFI())) {		(OpToFold->isImm() \|\| OpToFold->isFI() \|\| OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();		MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

// Verify the other operand is a VGPR, otherwise we would violate the		// Verify the other operand is a VGPR, otherwise we would violate the
// constant bus restriction.		// constant bus restriction.
unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;		unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
MachineOperand &OtherOp = MI->getOperand(OtherIdx);		MachineOperand &OtherOp = MI->getOperand(OtherIdx);
if (!OtherOp.isReg() \|\|		if (!OtherOp.isReg() \|\|
!TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))		!TII->getRegisterInfo().isVGPR(MRI, OtherOp.getReg()))
▲ Show 20 Lines • Show All 98 Lines • ▼ Show 20 Lines	if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {

// A frame index will resolve to a positive constant, so it should always be		// A frame index will resolve to a positive constant, so it should always be
// safe to fold the addressing mode, even pre-GFX9.		// safe to fold the addressing mode, even pre-GFX9.
UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());		UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
SOff->setReg(MFI->getStackPtrOffsetReg());		SOff->setReg(MFI->getStackPtrOffsetReg());
return;		return;
}		}

bool FoldingImmLike = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImmLike =
		OpToFold.isImm() \|\| OpToFold.isFI() \|\| OpToFold.isGlobal();

if (FoldingImmLike && UseMI->isCopy()) {		if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();		unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC		const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?		= TargetRegisterInfo::isVirtualRegister(DestReg) ?
MRI->getRegClass(DestReg) :		MRI->getRegClass(DestReg) :
TRI->getPhysRegClass(DestReg);		TRI->getPhysRegClass(DestReg);

▲ Show 20 Lines • Show All 384 Lines • ▼ Show 20 Lines	void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {		MachineOperand &OpToFold) const {
// We need to mutate the operands of new mov instructions to add implicit		// We need to mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer		// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.		// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;		SmallVector<MachineInstr *, 4> CopiesToReplace;
SmallVector<FoldCandidate, 4> FoldList;		SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);		MachineOperand &Dst = MI.getOperand(0);

bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI() \|\| OpToFold.isGlobal();
if (FoldingImm) {		if (FoldingImm) {
unsigned NumLiteralUses = 0;		unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;		MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;		int NonInlineUseOpNo = -1;

MachineRegisterInfo::use_iterator NextUse;		MachineRegisterInfo::use_iterator NextUse;
for (MachineRegisterInfo::use_iterator		for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();		Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
▲ Show 20 Lines • Show All 331 Lines • ▼ Show 20 Lines	for (I = MBB->begin(); I != MBB->end(); I = Next) {
// instruction, and not the omod multiply.		// instruction, and not the omod multiply.
if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|		if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|
!tryFoldOMod(MI))		!tryFoldOMod(MI))
tryFoldClamp(MI);		tryFoldClamp(MI);
continue;		continue;
}		}

MachineOperand &OpToFold = MI.getOperand(1);		MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImm =
		OpToFold.isImm() \|\| OpToFold.isFI() \|\| OpToFold.isGlobal();

// FIXME: We could also be folding things like TargetIndexes.		// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())		if (!FoldingImm && !OpToFold.isReg())
continue;		continue;

if (OpToFold.isReg() &&		if (OpToFold.isReg() &&
!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))		!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
continue;		continue;
Show All 17 Lines

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,582 Lines • ▼ Show 20 Lines	BuildMI(*BB, FirstMI, DebugLoc(),
TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),		TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
Exec)		Exec)
.addImm(-1);		.addImm(-1);
MI.eraseFromParent();		MI.eraseFromParent();
return BB;		return BB;
}		}

case AMDGPU::GET_GROUPSTATICSIZE: {		case AMDGPU::GET_GROUPSTATICSIZE: {
		assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
DebugLoc DL = MI.getDebugLoc();		DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))		BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.add(MI.getOperand(0))		.add(MI.getOperand(0))
.addImm(MFI->getLDSSize());		.addImm(MFI->getLDSSize());
MI.eraseFromParent();		MI.eraseFromParent();
return BB;		return BB;
}		}
case AMDGPU::SI_INDIRECT_SRC_V1:		case AMDGPU::SI_INDIRECT_SRC_V1:
▲ Show 20 Lines • Show All 1,172 Lines • ▼ Show 20 Lines	buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);		return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}		}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,		SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,		SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);		GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();		const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|		if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
		(!GV->hasExternalLinkage() \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) \|\|
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS \|\|		GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS \|\|
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)		GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);		return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

SDLoc DL(GSD);		SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();		EVT PtrVT = Op.getValueType();

// FIXME: Should not make address space based decisions here.		if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
		SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
		SIInstrInfo::MO_ABS32_LO);
		return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
		}

if (shouldEmitFixup(GV))		if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);		return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))		else if (shouldEmitPCReloc(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,		return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
SIInstrInfo::MO_REL32);		SIInstrInfo::MO_REL32);

SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,		SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
SIInstrInfo::MO_GOTPCREL32);		SIInstrInfo::MO_GOTPCREL32);
▲ Show 20 Lines • Show All 972 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fmad_ftz:		case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),		return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));		Op.getOperand(2), Op.getOperand(3));

case Intrinsic::amdgcn_if_break:		case Intrinsic::amdgcn_if_break:
return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,		return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
Op->getOperand(1), Op->getOperand(2)), 0);		Op->getOperand(1), Op->getOperand(2)), 0);

		case Intrinsic::amdgcn_groupstaticsize: {
		Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
		if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
		return Op;

		const Module *M = MF.getFunction().getParent();
		const GlobalValue *GV =
		M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
		SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
		SIInstrInfo::MO_ABS32_LO);
		return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
		}
default:		default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =		if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))		AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
return lowerImage(Op, ImageDimIntr, DAG);		return lowerImage(Op, ImageDimIntr, DAG);

return Op;		return Op;
}		}
}		}
▲ Show 20 Lines • Show All 4,643 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 2,697 Lines • ▼ Show 20 Lines	static bool compareMachineOp(const MachineOperand &Op0,
}		}
}		}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,		bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {		const MachineOperand &MO) const {
const MCInstrDesc &InstDesc = MI.getDesc();		const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];		const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];

assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());		assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI() \|\| MO.isGlobal());

if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)		if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;		return true;

if (OpInfo.RegClass < 0)		if (OpInfo.RegClass < 0)
return false;		return false;

if (MO.isImm() && isInlineConstant(MO, OpInfo))		if (MO.isImm() && isInlineConstant(MO, OpInfo))
▲ Show 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	if (MI.getOperand(i).isFPImm()) {
"all fp values to integers.";		"all fp values to integers.";
return false;		return false;
}		}

int RegClass = Desc.OpInfo[i].RegClass;		int RegClass = Desc.OpInfo[i].RegClass;

switch (Desc.OpInfo[i].OperandType) {		switch (Desc.OpInfo[i].OperandType) {
case MCOI::OPERAND_REGISTER:		case MCOI::OPERAND_REGISTER:
if (MI.getOperand(i).isImm()) {		if (MI.getOperand(i).isImm() \|\| MI.getOperand(i).isGlobal()) {
ErrInfo = "Illegal immediate value for operand.";		ErrInfo = "Illegal immediate value for operand.";
return false;		return false;
}		}
break;		break;
case AMDGPU::OPERAND_REG_IMM_INT32:		case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:		case AMDGPU::OPERAND_REG_IMM_FP32:
break;		break;
case AMDGPU::OPERAND_REG_INLINE_C_INT32:		case AMDGPU::OPERAND_REG_INLINE_C_INT32:
▲ Show 20 Lines • Show All 653 Lines • ▼ Show 20 Lines

// Decides whether MO is acceptable for the operand described by OpInfo.
// Register operands are delegated to isLegalRegOperand; every non-register
// operand is accepted (returns true).
// The only difference between the two revisions is the assert, which in the
// new revision also allows global-address operands (MO.isGlobal()).
bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,		bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,		const MCOperandInfo &OpInfo,
const MachineOperand &MO) const {		const MachineOperand &MO) const {
if (MO.isReg())		if (MO.isReg())
return isLegalRegOperand(MRI, OpInfo, MO);		return isLegalRegOperand(MRI, OpInfo, MO);

// Handle non-register types that are treated like immediates.		// Handle non-register types that are treated like immediates.
// The assert is the only check on this path; the return value below does not
// depend on the operand kind.
assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());		assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI() \|\| MO.isGlobal());
return true;		return true;
}		}

bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,		bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {		const MachineOperand *MO) const {
const MachineFunction &MF = *MI.getParent()->getParent();		const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();		const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();		const MCInstrDesc &InstDesc = MI.getDesc();
Show All 40 Lines	bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}		}

if (MO->isReg()) {		if (MO->isReg()) {
assert(DefinedRC);		assert(DefinedRC);
return isLegalRegOperand(MRI, OpInfo, *MO);		return isLegalRegOperand(MRI, OpInfo, *MO);
}		}

// Handle non-register types that are treated like immediates.		// Handle non-register types that are treated like immediates.
assert(MO->isImm() \|\| MO->isTargetIndex() \|\| MO->isFI());		assert(MO->isImm() \|\| MO->isTargetIndex() \|\| MO->isFI() \|\| MO->isGlobal());

if (!DefinedRC) {		if (!DefinedRC) {
// This operand expects an immediate.		// This operand expects an immediate.
return true;		return true;
}		}

return isImmOperandLegal(MI, OpIdx, *MO);		return isImmOperandLegal(MI, OpIdx, *MO);
}		}
▲ Show 20 Lines • Show All 2,378 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td

Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	SDTypeProfile<1, 9,
SDTCisVT<9, i1>]>, // idxen(imm)		SDTCisVT<9, i1>]>, // idxen(imm)
[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]		[SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
>;		>;

def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",		def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET",
SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>		SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
>;		>;

		// Present only in the new revision: SDNode record bound to the
		// AMDGPUISD::LDS opcode. The type profile declares one result and one
		// operand; value 0 is constrained to iPTR and value 1 to the same type
		// as value 0.
		def SIlds : SDNode<"AMDGPUISD::LDS",
		SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
		>;

def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",		def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
SIload_d16,		SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]		[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;		>;

def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",		def SIload_d16_lo_u8 : SDNode<"AMDGPUISD::LOAD_D16_LO_U8",
SIload_d16,		SIload_d16,
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]		[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
▲ Show 20 Lines • Show All 2,096 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 1,136 Lines • ▼ Show 20 Lines	def : GCNPat <
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))		(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;		>;

def : GCNPat <		def : GCNPat <
(i32 imm:$imm),		(i32 imm:$imm),
(S_MOV_B32 imm:$imm)		(S_MOV_B32 imm:$imm)
>;		>;

		// Present only in the new revision: maps an SIlds node over a target
		// global address, when wrapped in VGPRImm, to V_MOV_B32_e32 of that
		// address. NOTE(review): VGPRImm is defined outside this view; presumably
		// it selects the cases where a VGPR result is preferred - confirm.
		def : GCNPat <
		(VGPRImm<(SIlds tglobaladdr:$ga)>),
		(V_MOV_B32_e32 $ga)
		>;

		// Present only in the new revision: maps an SIlds node over a target
		// global address to S_MOV_B32 of that address.
		def : GCNPat <
		(SIlds tglobaladdr:$ga),
		(S_MOV_B32 $ga)
		>;

// FIXME: Workaround for ordering issue with peephole optimizer where		// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should		// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32		// use s_mov_b32, which can be shrunk to s_movk_i32
def : GCNPat <		def : GCNPat <
(VGPRImm<(f16 fpimm)>:$imm),		(VGPRImm<(f16 fpimm)>:$imm),
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))		(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;		>;

▲ Show 20 Lines • Show All 685 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp

Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
// a single mov, so we need to clear any subregister flag.		// a single mov, so we need to clear any subregister flag.
Src0.setSubReg(0);		Src0.setSubReg(0);
Src0.ChangeToImmediate(MovSrc.getImm());		Src0.ChangeToImmediate(MovSrc.getImm());
ConstantFolded = true;		ConstantFolded = true;
} else if (MovSrc.isFI()) {		} else if (MovSrc.isFI()) {
Src0.setSubReg(0);		Src0.setSubReg(0);
Src0.ChangeToFrameIndex(MovSrc.getIndex());		Src0.ChangeToFrameIndex(MovSrc.getIndex());
ConstantFolded = true;		ConstantFolded = true;
		} else if (MovSrc.isGlobal()) {
		Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
		MovSrc.getTargetFlags());
		ConstantFolded = true;
}		}

if (ConstantFolded) {		if (ConstantFolded) {
assert(MRI.use_empty(Reg));		assert(MRI.use_empty(Reg));
Def->eraseFromParent();		Def->eraseFromParent();
++NumLiteralConstantsFolded;		++NumLiteralConstantsFolded;
return true;		return true;
}		}
▲ Show 20 Lines • Show All 693 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
%val = load float, float addrspace(3)* %ptr		%val = load float, float addrspace(3)* %ptr
store float %val, float addrspace(1)* %out		store float %val, float addrspace(1)* %out
ret void		ret void
}		}

@g_lds = addrspace(3) global float undef, align 4		@g_lds = addrspace(3) global float undef, align 4

; Loads a float directly from the addrspace(3) global @g_lds and stores it to
; %out. The old expectations (left) have the ds_read address register loaded
; with the constant 0. The new expectations (right) have it loaded with the
; symbol reference g_lds@abs32@lo.
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:		; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0		; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]		; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {		define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds		%val = load float, float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out		store float %val, float addrspace(1)* %out
ret void		ret void
}		}


@ptr = addrspace(3) global i32 addrspace(3)* undef		@ptr = addrspace(3) global i32 addrspace(3)* undef
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll

	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

	declare i32 @llvm.amdgcn.workitem.id.x() #0			declare i32 @llvm.amdgcn.workitem.id.x() #0

	@lds.obj = addrspace(3) global [256 x i32] undef, align 4			@lds.obj = addrspace(3) global [256 x i32] undef, align 4

	; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:			; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
	; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0			; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
	; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]			; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]]
	; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
	; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b			; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
	; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12			; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
	define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {			define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
	entry:			entry:
	%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
	%sub1 = sub i32 0, %x.i			%sub1 = sub i32 0, %x.i
	%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1			%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
	%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3			%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
	▲ Show 20 Lines • Show All 145 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll

Show First 20 Lines • Show All 349 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
store float %sum, float addrspace(1)* %out.gep, align 4		store float %sum, float addrspace(1)* %out.gep, align 4
ret void		ret void
}		}

; GCN-LABEL: @simple_read2_f64		; GCN-LABEL: @simple_read2_f64
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}		; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8		; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}		; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}

; CI: buffer_store_dwordx2 [[RESULT]]		; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]		; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {		define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i		%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
}		}

@foo = addrspace(3) global [4 x i32] undef, align 4		@foo = addrspace(3) global [4 x i32] undef, align 4

; GCN-LABEL: @load_constant_adjacent_offsets		; GCN-LABEL: @load_constant_adjacent_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {		define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4		%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4		%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
%sum = add i32 %val0, %val1		%sum = add i32 %val0, %val1
store i32 %sum, i32 addrspace(1)* %out, align 4		store i32 %sum, i32 addrspace(1)* %out, align 4
ret void		ret void
}		}

; GCN-LABEL: @load_constant_disjoint_offsets		; GCN-LABEL: @load_constant_disjoint_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {		define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4		%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4		%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
%sum = add i32 %val0, %val1		%sum = add i32 %val0, %val1
store i32 %sum, i32 addrspace(1)* %out, align 4		store i32 %sum, i32 addrspace(1)* %out, align 4
ret void		ret void
}		}

@bar = addrspace(3) global [4 x i64] undef, align 4		@bar = addrspace(3) global [4 x i64] undef, align 4

; GCN-LABEL: @load_misaligned64_constant_offsets		; GCN-LABEL: @load_misaligned64_constant_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {		define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4		%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4		%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
%sum = add i64 %val0, %val1		%sum = add i64 %val0, %val1
store i64 %sum, i64 addrspace(1)* %out, align 8		store i64 %sum, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4		@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; GCN-LABEL: @load_misaligned64_constant_large_offsets		; GCN-LABEL: @load_misaligned64_constant_large_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}		; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000		; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1		; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1		; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
		; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
		; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
		; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {		define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4		%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4		%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
%sum = add i64 %val0, %val1		%sum = add i64 %val0, %val1
store i64 %sum, i64 addrspace(1)* %out, align 8		store i64 %sum, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}
▲ Show 20 Lines • Show All 165 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll

	Show First 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
	; 2 data subregisters from different super registers.			; 2 data subregisters from different super registers.
	; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:			; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}			; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
	; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}			; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0

	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}			; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				;
				; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
				; early legalization of the constant bus constraint on the v_lshl_add_u32,
				; and then SIFoldOperands folds in an unlucky order.
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]

	; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}			; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
	; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}			; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i			%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
	%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1			%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
	%val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8			%val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
	%val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8			%val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
	%val0.0 = extractelement <2 x float> %val0, i32 0			%val0.0 = extractelement <2 x float> %val0, i32 0
	%val1.1 = extractelement <2 x float> %val1, i32 1			%val1.1 = extractelement <2 x float> %val1, i32 1
	%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i			%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
	store float %val0.0, float addrspace(3)* %arrayidx0, align 4			store float %val0.0, float addrspace(3)* %arrayidx0, align 4
	%add.x = add nsw i32 %x.i, 8			%add.x = add nsw i32 %x.i, 8
	%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x			%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
	store float %val1.1, float addrspace(3)* %arrayidx1, align 4			store float %val1.1, float addrspace(3)* %arrayidx1, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @simple_write2_two_val_subreg2_f32			; GCN-LABEL: @simple_write2_two_val_subreg2_f32
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: {{buffer\|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}			; GCN-DAG: {{buffer\|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i			%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
	%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8			%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
	%val0 = extractelement <2 x float> %val, i32 0			%val0 = extractelement <2 x float> %val, i32 0
	%val1 = extractelement <2 x float> %val, i32 1			%val1 = extractelement <2 x float> %val, i32 1
	%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i			%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
	store float %val0, float addrspace(3)* %arrayidx0, align 4			store float %val0, float addrspace(3)* %arrayidx0, align 4
	%add.x = add nsw i32 %x.i, 8			%add.x = add nsw i32 %x.i, 8
	%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x			%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
	store float %val1, float addrspace(3)* %arrayidx1, align 4			store float %val1, float addrspace(3)* %arrayidx1, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @simple_write2_two_val_subreg4_f32			; GCN-LABEL: @simple_write2_two_val_subreg4_f32
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: {{buffer\|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}			; GCN-DAG: {{buffer\|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i			%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
	%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16			%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
	%val0 = extractelement <4 x float> %val, i32 0			%val0 = extractelement <4 x float> %val, i32 0
	%val1 = extractelement <4 x float> %val, i32 3			%val1 = extractelement <4 x float> %val, i32 3
	▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines
	}			}

	@foo = addrspace(3) global [4 x i32] undef, align 4			@foo = addrspace(3) global [4 x i32] undef, align 4

	; GCN-LABEL: @store_constant_adjacent_offsets			; GCN-LABEL: @store_constant_adjacent_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
	; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	define amdgpu_kernel void @store_constant_adjacent_offsets() {			define amdgpu_kernel void @store_constant_adjacent_offsets() {
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @store_constant_disjoint_offsets			; GCN-LABEL: @store_constant_disjoint_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}			; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
	; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2			; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
	define amdgpu_kernel void @store_constant_disjoint_offsets() {			define amdgpu_kernel void @store_constant_disjoint_offsets() {
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
	ret void			ret void
	}			}

	@bar = addrspace(3) global [4 x i64] undef, align 4			@bar = addrspace(3) global [4 x i64] undef, align 4

	; GCN-LABEL: @store_misaligned64_constant_offsets			; GCN-LABEL: @store_misaligned64_constant_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
	; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3			; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @store_misaligned64_constant_offsets() {			define amdgpu_kernel void @store_misaligned64_constant_offsets() {
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
	ret void			ret void
	}			}

	@bar.large = addrspace(3) global [4096 x i64] undef, align 4			@bar.large = addrspace(3) global [4096 x i64] undef, align 4

	; GCN-LABEL: @store_misaligned64_constant_large_offsets			; GCN-LABEL: @store_misaligned64_constant_large_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}			; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
	; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}			; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
	; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
	; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
				; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
				; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
				; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {			define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
	ret void			ret void
	}			}

	@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4			@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
	▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s
	; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s

	; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space			; CHECK: lds: unsupported initializer for address space

	@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]			@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]

	define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {			define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
	%gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10			%gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
	%ld = load i32, i32 addrspace(3)* %gep			%ld = load i32, i32 addrspace(3)* %gep
	store i32 %ld, i32 addrspace(1)* %out			store i32 %ld, i32 addrspace(1)* %out
	ret void			ret void
	}			}

llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll

				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefixes=GCN %s
				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s \| llvm-readobj -r -t \| FileCheck -check-prefixes=ELF %s

				@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
				@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8

				; ELF: Relocations [
				; ELF-NEXT: Section (3) .rel.text {
				; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
				; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
				; ELF-NEXT: }
				; ELF-NEXT: ]

				; ELF: Symbol {
				; ELF: Name: lds.defined
				; ELF-NEXT: Value: 0x8
				; ELF-NEXT: Size: 32
				; ELF-NEXT: Binding: Global (0x1)
				; ELF-NEXT: Type: Object (0x1)
				; ELF-NEXT: Other: 0
				; ELF-NEXT: Section: Processor Specific (0xFF00)
				; ELF-NEXT: }

				; ELF: Symbol {
				; ELF: Name: lds.external
				; ELF-NEXT: Value: 0x4
				; ELF-NEXT: Size: 0
				; ELF-NEXT: Binding: Global (0x1)
				; ELF-NEXT: Type: Object (0x1)
				; ELF-NEXT: Other: 0
				; ELF-NEXT: Section: Processor Specific (0xFF00)
				; ELF-NEXT: }

				; GCN-LABEL: {{^}}test_basic:
				; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
				; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
				;
				; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
				; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
				;
				; GCN: .globl lds.external
				; GCN: .amdgpu_lds lds.external, 0, 4
				; GCN: .globl lds.defined
				; GCN: .amdgpu_lds lds.defined, 32, 8
				define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
				main_body:
				%gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
				%tmp = load i32, i32 addrspace(3)* %gep0

				%mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
				%mask.32 = trunc i64 %mask to i32
				%gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
				store i32 %mask.32, i32 addrspace(3)* %gep1

				%r = bitcast i32 %tmp to float
				ret float %r
				}

				; Function Attrs: convergent nounwind readnone
				declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4

				attributes #0 = { "no-signed-zeros-fp-math"="true" }
				attributes #4 = { convergent nounwind readnone }

llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll

	; RUN: llc -march=amdgcn < %s \| FileCheck -check-prefix=ALL -check-prefix=GCN %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s \| FileCheck -check-prefix=ALL -check-prefix=HSA %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s \| FileCheck -check-prefix=ALL -check-prefix=HSA %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=ALL -check-prefix=EG %s			; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=ALL -check-prefix=EG %s

	; This test makes sure we do not double count global values when they are			; This test makes sure we do not double count global values when they are
	; used in different basic blocks.			; used in different basic blocks.

	; GCN: .long 47180			; GCN: .long 47180
	; GCN-NEXT: .long 32900			; GCN-NEXT: .long 32900
	Show All 27 Lines

llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s
	; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s

	; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space			; CHECK: lds: unsupported initializer for address space

	@lds = addrspace(3) global [256 x i32] zeroinitializer			@lds = addrspace(3) global [256 x i32] zeroinitializer

	define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {			define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
	%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10			%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
	%ld = load i32, i32 addrspace(3)* %gep			%ld = load i32, i32 addrspace(3)* %gep
	store i32 %ld, i32 addrspace(1)* %out			store i32 %ld, i32 addrspace(1)* %out
	ret void			ret void
	}			}

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

	Show First 20 Lines • Show All 262 Lines • ▼ Show 20 Lines
	}			}

	@lds0 = addrspace(3) global [512 x i32] undef			@lds0 = addrspace(3) global [512 x i32] undef

	; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:			; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
	; CIVI-DAG: s_mov_b32 m0			; CIVI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}			; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
				; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
				; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]

	; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8			; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
	define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {			define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
	%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%idx.0 = add nsw i32 %tid.x, 2			%idx.0 = add nsw i32 %tid.x, 2
	%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0			%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
	%val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)			%val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
	store i32 %idx.0, i32 addrspace(1)* %add_use			store i32 %idx.0, i32 addrspace(1)* %add_use
	store i32 %val0, i32 addrspace(1)* %out			store i32 %val0, i32 addrspace(1)* %out
	▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines
	}			}

	@lds1 = addrspace(3) global [512 x i64] undef, align 8			@lds1 = addrspace(3) global [512 x i64] undef, align 8

	; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:			; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
	; CIVI-DAG: s_mov_b32 m0			; CIVI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}			; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
				; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
				; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]

	; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16			; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
	define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {			define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
	%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%idx.0 = add nsw i32 %tid.x, 2			%idx.0 = add nsw i32 %tid.x, 2
	%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0			%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
	%val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)			%val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
	store i32 %idx.0, i32 addrspace(1)* %add_use			store i32 %idx.0, i32 addrspace(1)* %add_use
	store i64 %val0, i64 addrspace(1)* %out			store i64 %val0, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }
	attributes #2 = { nounwind argmemonly }			attributes #2 = { nounwind argmemonly }

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

Show First 20 Lines • Show All 125 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5		%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)		%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void		ret void
}		}

@lds0 = addrspace(3) global [512 x i32] undef, align 4		@lds0 = addrspace(3) global [512 x i32] undef, align 4

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:		; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
		; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
		; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
		; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8		; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)		%val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i32 %val0, i32 addrspace(1)* %out		store i32 %val0, i32 addrspace(1)* %out
▲ Show 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 {
%gep = getelementptr i32, i32* %gep.tid, i32 5		%gep = getelementptr i32, i32* %gep.tid, i32 5
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)		%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
ret void		ret void
}		}

@lds1 = addrspace(3) global [512 x i64] undef, align 8		@lds1 = addrspace(3) global [512 x i64] undef, align 8

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:		; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}		; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
		; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
		; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
		; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16		; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
%val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)		%val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i64 %val0, i64 addrspace(1)* %out		store i64 %val0, i64 addrspace(1)* %out
▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,NOHSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,HSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,HSA %s

@lds0 = addrspace(3) global [512 x float] undef, align 4		@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [256 x float] undef, align 4		@lds1 = addrspace(3) global [256 x float] undef, align 4

@large = addrspace(3) global [4096 x i32] undef, align 4		@large = addrspace(3) global [4096 x i32] undef, align 4

; CHECK-LABEL: {{^}}groupstaticsize_test0:		; CHECK-LABEL: {{^}}groupstaticsize_test0:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {		define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4		store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4		%val0 = load float, float addrspace(3)* %arrayidx0, align 4
store float %val0, float addrspace(1)* %out, align 4		store float %val0, float addrspace(1)* %out, align 4

ret void		ret void
}		}

; CHECK-LABEL: {{^}}groupstaticsize_test1:		; CHECK-LABEL: {{^}}groupstaticsize_test1:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {		define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
entry:		entry:
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4		store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%tmp = icmp eq i32 %cond, 0		%tmp = icmp eq i32 %cond, 0
br i1 %tmp, label %if, label %else		br i1 %tmp, label %if, label %else
Show All 11 Lines	else: ; preds = %entry
br label %endif		br label %endif

endif: ; preds = %else, %if		endif: ; preds = %else, %if
ret void		ret void
}		}

; Exceeds 16-bit simm limit of s_movk_i32		; Exceeds 16-bit simm limit of s_movk_i32
; CHECK-LABEL: {{^}}large_groupstaticsize:		; CHECK-LABEL: {{^}}large_groupstaticsize:
; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {		define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx		%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
store volatile i32 0, i32 addrspace(3)* %gep		store volatile i32 0, i32 addrspace(3)* %gep
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
store i32 %static_lds_size, i32 addrspace(1)* %size		store i32 %static_lds_size, i32 addrspace(1)* %size
ret void		ret void
}		}

declare i32 @llvm.amdgcn.groupstaticsize() #1		declare i32 @llvm.amdgcn.groupstaticsize() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=CI -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=CI -check-prefix=GCN %s

	@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4			@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4

	; Check that the LDS size is emitted correctly
	; SI: .long 47180
	; SI-NEXT: .long 65668
	; CI: .long 47180
	; CI-NEXT: .long 32900

	; GCN-LABEL: {{^}}local_memory:			; GCN-LABEL: {{^}}local_memory:

	; GCN-NOT: s_wqm_b64			; GCN-NOT: s_wqm_b64
	; GCN: ds_write_b32			; GCN: ds_write_b32

	; GCN: s_barrier			; GCN: s_barrier

	; GCN: ds_read_b32 {{v[0-9]+}},			; GCN: ds_read_b32 {{v[0-9]+}},
	Show All 32 Lines
	; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]			; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
	; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]			; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]

	; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]			; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
	; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]			; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]

	; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]			; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
	; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7			; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7

	define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {			define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
	entry:			entry:
	%x.i = call i32 @llvm.amdgcn.workitem.id.x()			%x.i = call i32 @llvm.amdgcn.workitem.id.x()
	%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i			%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
	store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4			store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
	%mul = shl nsw i32 %x.i, 1			%mul = shl nsw i32 %x.i, 1
	%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i			%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
	store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4			store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
	Show All 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4			@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4

	@lds = addrspace(3) global [512 x i32] undef, align 4			@lds = addrspace(3) global [512 x i32] undef, align 4

	; On SI we need to make sure that the base offset is a register and			; On SI we need to make sure that the base offset is a register and
	; not an immediate.			; not an immediate.

	; FUNC-LABEL: {{^}}load_i32_local_const_ptr:			; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
	; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0			; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
	; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4			; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4

	; R600: LDS_READ_RET			; R600: LDS_READ_RET
	define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {			define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
	entry:			entry:
	%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1			%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
	%tmp1 = load i32, i32 addrspace(3)* %tmp0			%tmp1 = load i32, i32 addrspace(3)* %tmp0
	%tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1			%tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
	store i32 %tmp1, i32 addrspace(1)* %tmp2			store i32 %tmp1, i32 addrspace(1)* %tmp2
	Show All 22 Lines

llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck %s

	; This used to crash in LiveIntervalAnalysis via SILoadStoreOptimizer			; This used to crash in LiveIntervalAnalysis via SILoadStoreOptimizer
	; while fixing up the merge of two ds_write instructions.			; while fixing up the merge of two ds_write instructions.

	@tess_lds = external addrspace(3) global [8192 x i32]			@tess_lds = external addrspace(3) global [8192 x i32]

	; CHECK-LABEL: {{^}}main:			; CHECK-LABEL: {{^}}main:
	; CHECK: ds_write2_b32			; CHECK: ds_write_b32
				; CHECK: ds_write_b32
	; CHECK: v_mov_b32_e32 v1, v0			; CHECK: v_mov_b32_e32 v1, v0
	; CHECK: tbuffer_store_format_xyzw v[0:3],			; CHECK: tbuffer_store_format_xyzw v[0:3],
	define amdgpu_vs void @main(i32 inreg %arg) {			define amdgpu_vs void @main(i32 inreg %arg) {
	main_body:			main_body:
	%tmp = load float, float addrspace(3)* undef, align 4			%tmp = load float, float addrspace(3)* undef, align 4
	%tmp1 = load float, float addrspace(3)* undef, align 4			%tmp1 = load float, float addrspace(3)* undef, align 4
	store float %tmp, float addrspace(3)* null, align 4			store float %tmp, float addrspace(3)* null, align 4
	%tmp2 = bitcast float %tmp to i32			%tmp2 = bitcast float %tmp to i32
	Show All 18 Lines

llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck -check-prefix=ERROR %s
	; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 \| FileCheck -check-prefix=ERROR %s
	; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 \| FileCheck -check-prefix=ERROR %s

	; ERROR: error: local memory limit exceeded (400000) in use_huge_lds

	@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4

	define amdgpu_kernel void @use_huge_lds() {
	entry:
	%v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
	store i32 0, i32 addrspace(3)* %v0
	ret void
	}

llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll

	; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s \| FileCheck -check-prefix=IR %s			; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s \| FileCheck -check-prefix=IR %s
	; RUN: llc -march=amdgcn -mcpu=tonga < %s \| FileCheck -check-prefix=ASM %s			; RUN: llc -march=amdgcn -mcpu=tonga < %s \| FileCheck -check-prefix=ASM %s


	@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4			@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
	@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4			@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4

	; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {			; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
	; IR: alloca [10 x i32]			; IR: alloca [10 x i32]
	; ASM-LABEL: {{^}}promote_alloca_size_256:			; ASM-LABEL: {{^}}promote_alloca_size_256:
	; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)			; ASM: .amdgpu_lds global_array0, 30000, 4
				; ASM: .amdgpu_lds global_array1, 30000, 4

	define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {			define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
	entry:			entry:
	%stack = alloca [10 x i32], align 4, addrspace(5)			%stack = alloca [10 x i32], align 4, addrspace(5)
	%tmp = load i32, i32 addrspace(1)* %in, align 4			%tmp = load i32, i32 addrspace(1)* %in, align 4
	%arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp			%arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
	store i32 4, i32 addrspace(5)* %arrayidx1, align 4			store i32 4, i32 addrspace(5)* %arrayidx1, align 4
	%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1			%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
	Show All 16 Lines

llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll

Show All 27 Lines	define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
store float %val0, float addrspace(1)* %out		store float %val0, float addrspace(1)* %out
ret void		ret void
}		}

; Make sure once the first use is folded into the addressing mode, the		; Make sure once the first use is folded into the addressing mode, the
; remaining add use goes through the normal shl + add constant fold.		; remaining add use goes through the normal shl + add constant fold.

; GCN-LABEL: {{^}}load_shl_base_lds_1:		; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

		; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
		; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8		; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}		; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]		; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]		; GCN-DAG: buffer_store_dword [[ADDUSE]]
; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
Show All 18 Lines	define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i8 %val0, i8 addrspace(1)* %out		store i8 %val0, i8 addrspace(1)* %out
ret void		ret void
}		}

; The two globals are placed adjacent in memory, so the same base		; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.		; pointer can be used with an offset into the second one.

		; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:		; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN: s_mov_b32 m0, -1		; GCN: s_mov_b32 m0, -1
; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
		; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
		; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
		; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9

; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {		define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4		%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0		%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
%val1 = load float, float addrspace(3)* %arrayidx1, align 4		%val1 = load float, float addrspace(3)* %arrayidx1, align 4
▲ Show 20 Lines • Show All 342 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s		; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

; These tests check that the compiler won't crash when it needs to spill		; These tests check that the compiler won't crash when it needs to spill
; SGPRs.		; SGPRs.

@ddxy_lds = external addrspace(3) global [64 x i32]

; GCN-LABEL: {{^}}main:		; GCN-LABEL: {{^}}main:
; GCN: s_wqm		; GCN: s_wqm

; Make sure we are not emitting an unused scratch resource descriptor setup		; Make sure we are not emitting an unused scratch resource descriptor setup
; GCN-NOT: s_mov_b32		; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32

; GCN: s_mov_b32 m0		; GCN: s_mov_b32 m0

; Make sure scratch space isn't being used for SGPR->VGPR spills		; Make sure scratch space isn't being used for SGPR->VGPR spills

; Writing to M0 from an SMRD instruction will hang the GPU.		; Writing to M0 from an SMRD instruction will hang the GPU.
; GCN-NOT: s_buffer_load_dword m0		; GCN-NOT: s_buffer_load_dword m0
; GCN: s_endpgm		; GCN: s_endpgm

; TOVGPR: ScratchSize: 0{{$}}		; TOVGPR: ScratchSize: 0{{$}}
define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {		define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:		main_body:
		%lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0		%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0		%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
%tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)		%tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
%tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)		%tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)
%tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)		%tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)
%tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0)		%tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0)
%tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0)		%tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0)
%tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0)		%tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0)
▲ Show 20 Lines • Show All 161 Lines • ▼ Show 20 Lines	main_body:
%i.i1 = extractelement <2 x i32> %arg6, i32 0		%i.i1 = extractelement <2 x i32> %arg6, i32 0
%j.i2 = extractelement <2 x i32> %arg6, i32 1		%j.i2 = extractelement <2 x i32> %arg6, i32 1
%i.f.i3 = bitcast i32 %i.i1 to float		%i.f.i3 = bitcast i32 %i.i1 to float
%j.f.i4 = bitcast i32 %j.i2 to float		%j.f.i4 = bitcast i32 %j.i2 to float
%p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0		%p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0
%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0		%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)		%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
%tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109		%tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
%tmp111 = bitcast float %p2.i to i32		%tmp111 = bitcast float %p2.i to i32
store i32 %tmp111, i32 addrspace(3)* %tmp110		store i32 %tmp111, i32 addrspace(3)* %tmp110
%tmp112 = bitcast float %p2.i96 to i32		%tmp112 = bitcast float %p2.i96 to i32
store i32 %tmp112, i32 addrspace(3)* %tmp110		store i32 %tmp112, i32 addrspace(3)* %tmp110
%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)		%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
%tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113		%tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
%tmp115 = and i32 %tmp113, -4		%tmp115 = and i32 %tmp113, -4
%tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115		%tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
%tmp117 = add i32 %tmp115, 1		%tmp117 = add i32 %tmp115, 1
%tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117		%tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
%tmp119 = bitcast float %p2.i to i32		%tmp119 = bitcast float %p2.i to i32
store i32 %tmp119, i32 addrspace(3)* %tmp114		store i32 %tmp119, i32 addrspace(3)* %tmp114
%tmp120 = load i32, i32 addrspace(3)* %tmp116		%tmp120 = load i32, i32 addrspace(3)* %tmp116
%tmp121 = bitcast i32 %tmp120 to float		%tmp121 = bitcast i32 %tmp120 to float
%tmp122 = load i32, i32 addrspace(3)* %tmp118		%tmp122 = load i32, i32 addrspace(3)* %tmp118
%tmp123 = bitcast i32 %tmp122 to float		%tmp123 = bitcast i32 %tmp122 to float
%tmp124 = fsub float %tmp123, %tmp121		%tmp124 = fsub float %tmp123, %tmp121
%tmp125 = bitcast float %p2.i96 to i32		%tmp125 = bitcast float %p2.i96 to i32
Show All 10 Lines	main_body:
%tmp135 = extractelement <4 x float> %tmp134, i32 0		%tmp135 = extractelement <4 x float> %tmp134, i32 0
%tmp136 = extractelement <4 x float> %tmp134, i32 1		%tmp136 = extractelement <4 x float> %tmp134, i32 1
%tmp137 = fmul float %tmp59, %p2.i		%tmp137 = fmul float %tmp59, %p2.i
%tmp138 = fmul float %tmp59, %p2.i96		%tmp138 = fmul float %tmp59, %p2.i96
%tmp139 = fmul float %tmp59, %p2.i96		%tmp139 = fmul float %tmp59, %p2.i96
%tmp140 = fmul float %tmp59, %p2.i96		%tmp140 = fmul float %tmp59, %p2.i96
%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)		%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
%tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141		%tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
%tmp143 = bitcast float %tmp137 to i32		%tmp143 = bitcast float %tmp137 to i32
store i32 %tmp143, i32 addrspace(3)* %tmp142		store i32 %tmp143, i32 addrspace(3)* %tmp142
%tmp144 = bitcast float %tmp138 to i32		%tmp144 = bitcast float %tmp138 to i32
store i32 %tmp144, i32 addrspace(3)* %tmp142		store i32 %tmp144, i32 addrspace(3)* %tmp142
%tmp145 = bitcast float %tmp139 to i32		%tmp145 = bitcast float %tmp139 to i32
store i32 %tmp145, i32 addrspace(3)* %tmp142		store i32 %tmp145, i32 addrspace(3)* %tmp142
%tmp146 = bitcast float %tmp140 to i32		%tmp146 = bitcast float %tmp140 to i32
store i32 %tmp146, i32 addrspace(3)* %tmp142		store i32 %tmp146, i32 addrspace(3)* %tmp142
%mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)		%tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
%tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147		%tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
%tmp149 = and i32 %tmp147, -4		%tmp149 = and i32 %tmp147, -4
%tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149		%tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
%tmp151 = add i32 %tmp149, 2		%tmp151 = add i32 %tmp149, 2
%tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151		%tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
%tmp153 = bitcast float %tmp137 to i32		%tmp153 = bitcast float %tmp137 to i32
store i32 %tmp153, i32 addrspace(3)* %tmp148		store i32 %tmp153, i32 addrspace(3)* %tmp148
%tmp154 = load i32, i32 addrspace(3)* %tmp150		%tmp154 = load i32, i32 addrspace(3)* %tmp150
%tmp155 = bitcast i32 %tmp154 to float		%tmp155 = bitcast i32 %tmp154 to float
%tmp156 = load i32, i32 addrspace(3)* %tmp152		%tmp156 = load i32, i32 addrspace(3)* %tmp152
%tmp157 = bitcast i32 %tmp156 to float		%tmp157 = bitcast i32 %tmp156 to float
%tmp158 = fsub float %tmp157, %tmp155		%tmp158 = fsub float %tmp157, %tmp155
%tmp159 = bitcast float %tmp138 to i32		%tmp159 = bitcast float %tmp138 to i32
▲ Show 20 Lines • Show All 1,427 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @target_fiji() #4 {
%gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext		%gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
store i32 0, i32 addrspace(1)* %gep		store i32 0, i32 addrspace(1)* %gep
call void @llvm.amdgcn.s.dcache.wb()		call void @llvm.amdgcn.s.dcache.wb()
ret void		ret void
}		}

; CHECK-LABEL: {{^}}promote_alloca_enabled:		; CHECK-LABEL: {{^}}promote_alloca_enabled:
; CHECK: ds_read_b32		; CHECK: ds_read_b32
; CHECK: ; LDSByteSize: 5120
define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {		define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
entry:		entry:
%stack = alloca [5 x i32], align 4, addrspace(5)		%stack = alloca [5 x i32], align 4, addrspace(5)
%tmp = load i32, i32 addrspace(1)* %in, align 4		%tmp = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp		%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
%load = load i32, i32 addrspace(5)* %arrayidx1		%load = load i32, i32 addrspace(5)* %arrayidx1
store i32 %load, i32 addrspace(1)* %out		store i32 %load, i32 addrspace(1)* %out
ret void		ret void
Show All 23 Lines

llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

	; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after finalize-isel -o %t.mir %s			; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after finalize-isel -o %t.mir %s
	; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s			; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s

	; Test that SIMachineFunctionInfo can be round trip serialized through			; Test that SIMachineFunctionInfo can be round trip serialized through
	; MIR.			; MIR.

	@lds = addrspace(3) global [512 x float] undef, align 4			@lds = addrspace(3) global [512 x float] undef, align 4

	; CHECK-LABEL: {{^}}name: kernel			; CHECK-LABEL: {{^}}name: kernel
	; CHECK: machineFunctionInfo:			; CHECK: machineFunctionInfo:
	; CHECK-NEXT: explicitKernArgSize: 128			; CHECK-NEXT: explicitKernArgSize: 128
	; CHECK-NEXT: maxKernArgAlign: 64			; CHECK-NEXT: maxKernArgAlign: 64
	; CHECK-NEXT: ldsSize: 2048			; CHECK-NEXT: ldsSize: 0
	; CHECK-NEXT: isEntryFunction: true			; CHECK-NEXT: isEntryFunction: true
	; CHECK-NEXT: noSignedZerosFPMath: false			; CHECK-NEXT: noSignedZerosFPMath: false
	; CHECK-NEXT: memoryBound: false			; CHECK-NEXT: memoryBound: false
	; CHECK-NEXT: waveLimiter: false			; CHECK-NEXT: waveLimiter: false
	; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'			; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
	; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'			; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'
	; CHECK-NEXT: frameOffsetReg: '$sgpr101'			; CHECK-NEXT: frameOffsetReg: '$sgpr101'
	; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101'			; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101'
	▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 206420

llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td

llvm/trunk/lib/Target/AMDGPU/SIInstructions.td

llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp

llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll

llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll

llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll

llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll

llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll

llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll

llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll

llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll

llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll

llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll

llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll

llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 206420

llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp

llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td

llvm/trunk/lib/Target/AMDGPU/SIInstructions.td

llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp

llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll

llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll

llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll

llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll

llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll

llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll

llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll

llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll

llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll

llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll

llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll

llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll

llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic