Diff 198518

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Show First 20 Lines • Show All 279 Lines • ▼ Show 20 Lines	DisasmLines.push_back(
+ "_" + Twine(MBB.getNumber()) + ":").str());		+ "_" + Twine(MBB.getNumber()) + ":").str());
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());		DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
HexLines.push_back("");		HexLines.push_back("");
}		}
AsmPrinter::EmitBasicBlockStart(MBB);		AsmPrinter::EmitBasicBlockStart(MBB);
}		}

void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {		void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
		if (GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
		const Triple::OSType OS = TM.getTargetTriple().getOS();

// Group segment variables aren't emitted in HSA.		// LDS variables aren't emitted in HSA or PAL yet.
if (AMDGPU::isGroupSegment(GV))		if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
return;		return;
		arsenm: This breaks the initializer error. This should be moved after [the initializer check below].

		if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
		OutContext.reportError({},
		Twine(GV->getName()) +
		": unsupported initializer for address space");
		return;
		}

		MCSymbol *GVSym = getSymbol(GV);

		GVSym->redefineIfPossible();
		if (GVSym->isDefined() \|\| GVSym->isVariable())
		report_fatal_error("symbol '" + Twine(GVSym->getName()) +
		"' is already defined");

		const DataLayout &DL = GV->getParent()->getDataLayout();
		uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
		arsenm: This uses the deprecated pointer type method. This should use getValueType() on the global instead.
		unsigned Align = GV->getAlignment();
		if (!Align)
		Align = 4;

		EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
		EmitLinkage(GV, GVSym);
		OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
		getTargetStreamer()->emitAMDGPULDS(GVSym, Align);
		return;
		}

AsmPrinter::EmitGlobalVariable(GV);		AsmPrinter::EmitGlobalVariable(GV);
}		}

bool AMDGPUAsmPrinter::doFinalization(Module &M) {		bool AMDGPUAsmPrinter::doFinalization(Module &M) {
CallGraphResourceInfo.clear();		CallGraphResourceInfo.clear();

if (AMDGPU::isGFX10(*getGlobalSTI())) {		if (AMDGPU::isGFX10(*getGlobalSTI())) {
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());		OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
▲ Show 20 Lines • Show All 868 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIFoldOperands.cpp

Show First 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	FoldCandidate(MachineInstr MI, unsigned OpNo, MachineOperand FoldOp,
UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),		UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
Kind(FoldOp->getType()),		Kind(FoldOp->getType()),
Commuted(Commuted_) {		Commuted(Commuted_) {
if (FoldOp->isImm()) {		if (FoldOp->isImm()) {
ImmToFold = FoldOp->getImm();		ImmToFold = FoldOp->getImm();
} else if (FoldOp->isFI()) {		} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();		FrameIndexToFold = FoldOp->getIndex();
} else {		} else {
assert(FoldOp->isReg());		assert(FoldOp->isReg() \|\| FoldOp->isGlobal());
OpToFold = FoldOp;		OpToFold = FoldOp;
}		}
}		}

bool isFI() const {		bool isFI() const {
return Kind == MachineOperand::MO_FrameIndex;		return Kind == MachineOperand::MO_FrameIndex;
}		}

bool isImm() const {		bool isImm() const {
return Kind == MachineOperand::MO_Immediate;		return Kind == MachineOperand::MO_Immediate;
}		}

bool isReg() const {		bool isReg() const {
return Kind == MachineOperand::MO_Register;		return Kind == MachineOperand::MO_Register;
}		}

		bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

bool isCommuted() const {		bool isCommuted() const {
return Commuted;		return Commuted;
}		}

bool needsShrink() const {		bool needsShrink() const {
return ShrinkOpcode != -1;		return ShrinkOpcode != -1;
}		}

▲ Show 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	static bool updateOperand(FoldCandidate &Fold,

assert(!Fold.needsShrink() && "not handled");		assert(!Fold.needsShrink() && "not handled");

if (Fold.isImm()) {		if (Fold.isImm()) {
Old.ChangeToImmediate(Fold.ImmToFold);		Old.ChangeToImmediate(Fold.ImmToFold);
return true;		return true;
}		}

		if (Fold.isGlobal()) {
		Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
		Fold.OpToFold->getTargetFlags());
		return true;
		}

if (Fold.isFI()) {		if (Fold.isFI()) {
Old.ChangeToFrameIndex(Fold.FrameIndexToFold);		Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
return true;		return true;
}		}

MachineOperand *New = Fold.OpToFold;		MachineOperand *New = Fold.OpToFold;
if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&		if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
TargetRegisterInfo::isVirtualRegister(New->getReg())) {		TargetRegisterInfo::isVirtualRegister(New->getReg())) {
▲ Show 20 Lines • Show All 178 Lines • ▼ Show 20 Lines	for (MachineRegisterInfo::use_iterator

foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,		foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
CopiesToReplace);		CopiesToReplace);
}		}

return;		return;
}		}

		bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isGlobal();
bool FoldingImm = OpToFold.isImm();

if (FoldingImm && UseMI->isCopy()) {		if (FoldingImm && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();		unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC		const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?		= TargetRegisterInfo::isVirtualRegister(DestReg) ?
MRI->getRegClass(DestReg) :		MRI->getRegClass(DestReg) :
TRI->getPhysRegClass(DestReg);		TRI->getPhysRegClass(DestReg);

▲ Show 20 Lines • Show All 344 Lines • ▼ Show 20 Lines	void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {		MachineOperand &OpToFold) const {
// We need mutate the operands of new mov instructions to add implicit		// We need mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer		// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.		// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;		SmallVector<MachineInstr *, 4> CopiesToReplace;
SmallVector<FoldCandidate, 4> FoldList;		SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);		MachineOperand &Dst = MI.getOperand(0);

bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI() \|\| OpToFold.isGlobal();
if (FoldingImm) {		if (FoldingImm) {
unsigned NumLiteralUses = 0;		unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;		MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;		int NonInlineUseOpNo = -1;

MachineRegisterInfo::use_iterator NextUse;		MachineRegisterInfo::use_iterator NextUse;
for (MachineRegisterInfo::use_iterator		for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();		Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
▲ Show 20 Lines • Show All 329 Lines • ▼ Show 20 Lines	for (I = MBB->begin(); I != MBB->end(); I = Next) {
// instruction, and not the omod multiply.		// instruction, and not the omod multiply.
if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|		if (IsIEEEMode \|\| (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) \|\|
!tryFoldOMod(MI))		!tryFoldOMod(MI))
tryFoldClamp(MI);		tryFoldClamp(MI);
continue;		continue;
}		}

MachineOperand &OpToFold = MI.getOperand(1);		MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() \|\| OpToFold.isFI();		bool FoldingImm =
		OpToFold.isImm() \|\| OpToFold.isFI() \|\| OpToFold.isGlobal();

// FIXME: We could also be folding things like TargetIndexes.		// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())		if (!FoldingImm && !OpToFold.isReg())
continue;		continue;

if (OpToFold.isReg() &&		if (OpToFold.isReg() &&
!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))		!TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
continue;		continue;
Show All 17 Lines

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 343 Lines • ▼ Show 20 Lines	public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,		getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;		StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;		ConstraintType getConstraintType(StringRef Constraint) const override;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,		SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;		SDValue V) const;

void finalizeLowering(MachineFunction &MF) const override;		void finalizeLowering(MachineFunction &MF) const override;

		void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
		const APInt &DemandedElts,
		const SelectionDAG &DAG,
		unsigned Depth = 0) const override;
void computeKnownBitsForFrameIndex(const SDValue Op,		void computeKnownBitsForFrameIndex(const SDValue Op,
KnownBits &Known,		KnownBits &Known,
const APInt &DemandedElts,		const APInt &DemandedElts,
const SelectionDAG &DAG,		const SelectionDAG &DAG,
unsigned Depth = 0) const override;		unsigned Depth = 0) const override;

bool isSDNodeSourceOfDivergence(const SDNode *N,		bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo FLI, LegacyDivergenceAnalysis DA) const override;		FunctionLoweringInfo FLI, LegacyDivergenceAnalysis DA) const override;
Show All 17 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,438 Lines • ▼ Show 20 Lines	case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),		BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addImm(-1);		.addImm(-1);
MI.eraseFromParent();		MI.eraseFromParent();
return BB;		return BB;
}		}

case AMDGPU::GET_GROUPSTATICSIZE: {		case AMDGPU::GET_GROUPSTATICSIZE: {
		assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
DebugLoc DL = MI.getDebugLoc();		DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))		BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
.add(MI.getOperand(0))		.add(MI.getOperand(0))
.addImm(MFI->getLDSSize());		.addImm(MFI->getLDSSize());
MI.eraseFromParent();		MI.eraseFromParent();
return BB;		return BB;
}		}
case AMDGPU::SI_INDIRECT_SRC_V1:		case AMDGPU::SI_INDIRECT_SRC_V1:
▲ Show 20 Lines • Show All 1,122 Lines • ▼ Show 20 Lines	buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);		return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}		}

SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,		SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
SDValue Op,		SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);		GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();		const GlobalValue *GV = GSD->getGlobal();
if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|		if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
		(!GV->hasExternalLinkage() \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA \|\|
		getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) \|\|
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS \|\|		GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS \|\|
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)		GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);		return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

SDLoc DL(GSD);		SDLoc DL(GSD);
EVT PtrVT = Op.getValueType();		EVT PtrVT = Op.getValueType();

// FIXME: Should not make address space based decisions here.		if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
		SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
		SIInstrInfo::MO_ABS32_LO);
		GA = DAG.getNode(ISD::AssertZext, DL, MVT::i32, GA,
		DAG.getValueType(MVT::i16));
		return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
		arsenm: Shouldn't introduce machine nodes here. What is the problem with selecting the GlobalAddress directly in a tablegen pattern or manually in AMDGPUISelDAGToDAG?
		}

if (shouldEmitFixup(GV))		if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);		return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
else if (shouldEmitPCReloc(GV))		else if (shouldEmitPCReloc(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,		return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
SIInstrInfo::MO_REL32);		SIInstrInfo::MO_REL32);

SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,		SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
SIInstrInfo::MO_GOTPCREL32);		SIInstrInfo::MO_GOTPCREL32);
▲ Show 20 Lines • Show All 947 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_wwm: {		case Intrinsic::amdgcn_wwm: {
SDValue Src = Op.getOperand(1);		SDValue Src = Op.getOperand(1);
return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),		return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
0);		0);
}		}
case Intrinsic::amdgcn_fmad_ftz:		case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),		return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));		Op.getOperand(2), Op.getOperand(3));
		case Intrinsic::amdgcn_groupstaticsize: {
		Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
		if (OS == Triple::AMDHSA \|\| OS == Triple::AMDPAL)
		return Op;
		arsenm: Maybe we can define the intrinsic to return -1 if the size can't be determined, similar to how llvm.objectsize works?
		nhaehnle (author): Makes sense, though that seems like a task for a separate change.

		const Module *M = MF.getFunction().getParent();
		const GlobalValue *GV =
		M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
		SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
		SIInstrInfo::MO_ABS32_LO);
		return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
		}
default:		default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =		if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))		AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
return lowerImage(Op, ImageDimIntr, DAG);		return lowerImage(Op, ImageDimIntr, DAG);

return Op;		return Op;
}		}
}		}
▲ Show 20 Lines • Show All 4,392 Lines • ▼ Show 20 Lines	MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
Info->getScratchWaveOffsetReg());		Info->getScratchWaveOffsetReg());
}		}

Info->limitOccupancy(MF);		Info->limitOccupancy(MF);

TargetLoweringBase::finalizeLowering(MF);		TargetLoweringBase::finalizeLowering(MF);
}		}

		void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
		KnownBits &Known,
		const APInt &DemandedElts,
		const SelectionDAG &DAG,
		unsigned Depth) const {
		if (Op->isMachineOpcode()) {
		switch (Op->getMachineOpcode()) {
		case AMDGPU::S_MOV_B32:
		case AMDGPU::V_MOV_B32_e32:
		Known = DAG.computeKnownBits(Op->getOperand(0), DemandedElts, Depth + 1);
		break;
		default:
		break;
		}
		return;
		}

		AMDGPUTargetLowering::computeKnownBitsForTargetNode(Op, Known, DemandedElts,
		DAG, Depth);
		arsenm: I don't think there should be a need to handle machine nodes here, particularly moves. The GlobalAddress should be legal, and the select pattern should be inserting the move?
		}

void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,		void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
KnownBits &Known,		KnownBits &Known,
const APInt &DemandedElts,		const APInt &DemandedElts,
const SelectionDAG &DAG,		const SelectionDAG &DAG,
unsigned Depth) const {		unsigned Depth) const {
TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,		TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
DAG, Depth);		DAG, Depth);

▲ Show 20 Lines • Show All 210 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 2,622 Lines • ▼ Show 20 Lines	static bool compareMachineOp(const MachineOperand &Op0,
}		}
}		}

bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,		bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
const MachineOperand &MO) const {		const MachineOperand &MO) const {
const MCInstrDesc &InstDesc = MI.getDesc();		const MCInstrDesc &InstDesc = MI.getDesc();
const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];		const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];

assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());		assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI() \|\| MO.isGlobal());

if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)		if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
return true;		return true;

if (OpInfo.RegClass < 0)		if (OpInfo.RegClass < 0)
return false;		return false;

if (MO.isImm() && isInlineConstant(MO, OpInfo))		if (MO.isImm() && isInlineConstant(MO, OpInfo))
▲ Show 20 Lines • Show All 940 Lines • ▼ Show 20 Lines

bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,		bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
const MCOperandInfo &OpInfo,		const MCOperandInfo &OpInfo,
const MachineOperand &MO) const {		const MachineOperand &MO) const {
if (MO.isReg())		if (MO.isReg())
return isLegalRegOperand(MRI, OpInfo, MO);		return isLegalRegOperand(MRI, OpInfo, MO);

// Handle non-register types that are treated like immediates.		// Handle non-register types that are treated like immediates.
assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI());		assert(MO.isImm() \|\| MO.isTargetIndex() \|\| MO.isFI() \|\| MO.isGlobal());
return true;		return true;
}		}

bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,		bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
const MachineOperand *MO) const {		const MachineOperand *MO) const {
const MachineFunction &MF = *MI.getParent()->getParent();		const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();		const MachineRegisterInfo &MRI = MF.getRegInfo();
const MCInstrDesc &InstDesc = MI.getDesc();		const MCInstrDesc &InstDesc = MI.getDesc();
Show All 40 Lines	bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}		}

if (MO->isReg()) {		if (MO->isReg()) {
assert(DefinedRC);		assert(DefinedRC);
return isLegalRegOperand(MRI, OpInfo, *MO);		return isLegalRegOperand(MRI, OpInfo, *MO);
}		}

// Handle non-register types that are treated like immediates.		// Handle non-register types that are treated like immediates.
assert(MO->isImm() \|\| MO->isTargetIndex() \|\| MO->isFI());		assert(MO->isImm() \|\| MO->isTargetIndex() \|\| MO->isFI() \|\| MO->isGlobal());

if (!DefinedRC) {		if (!DefinedRC) {
// This operand expects an immediate.		// This operand expects an immediate.
return true;		return true;
}		}

return isImmOperandLegal(MI, OpIdx, *MO);		return isImmOperandLegal(MI, OpIdx, *MO);
}		}
▲ Show 20 Lines • Show All 2,290 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/32-bit-local-address-space.ll

Show First 20 Lines • Show All 75 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
%val = load float, float addrspace(3)* %ptr		%val = load float, float addrspace(3)* %ptr
store float %val, float addrspace(1)* %out		store float %val, float addrspace(1)* %out
ret void		ret void
}		}

@g_lds = addrspace(3) global float undef, align 4		@g_lds = addrspace(3) global float undef, align 4

; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:		; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0		; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]		; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {		define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds		%val = load float, float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out		store float %val, float addrspace(1)* %out
ret void		ret void
}		}


@ptr = addrspace(3) global i32 addrspace(3)* undef		@ptr = addrspace(3) global i32 addrspace(3)* undef
▲ Show 20 Lines • Show All 46 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/constant-fold-mi-operands.ll

; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}fold_mi_v_and_0:
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
arsenm: I'm not sure these tests have been entirely replaced with MIR yet. Can you just set the triple to preserve them?
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
store i32 %and, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_s_and_0:
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
store i32 %and, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_v_or_0:
; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
store i32 %or, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_s_or_0:
; GCN: s_load_dword [[SVAL:s[0-9]+]]
; GCN-NOT: [[SVAL]]
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
store i32 %or, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_v_xor_0:
; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
store i32 %xor, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_s_xor_0:
; GCN: s_load_dword [[SVAL:s[0-9]+]]
; GCN-NOT: [[SVAL]]
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
store i32 %xor, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_s_not_0:
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, -1
store i32 %xor, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}fold_mi_v_not_0:		; GCN-LABEL: {{^}}fold_mi_v_not_0:
; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}		; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
; GCN: v_bcnt_u32_b32{{(_e32)(_e64)}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}		; GCN: v_bcnt_u32_b32{{(_e32)(_e64)}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]		; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}		; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}		; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {		define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
%vreg = load volatile i64, i64 addrspace(1)* undef		%vreg = load volatile i64, i64 addrspace(1)* undef
Show All 37 Lines	define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
%xor = xor i64 %ctpop, -1		%xor = xor i64 %ctpop, -1
%and = and i64 %xor, %vreg1		%and = and i64 %xor, %vreg1
store i64 %and, i64 addrspace(1)* %out		store i64 %and, i64 addrspace(1)* %out
ret void		ret void
}		}

declare i64 @llvm.ctpop.i64(i64) #1		declare i64 @llvm.ctpop.i64(i64) #1
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1		declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare i32 @llvm.amdgcn.groupstaticsize() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/ds-sub-offset.ll

	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

	declare i32 @llvm.amdgcn.workitem.id.x() #0			declare i32 @llvm.amdgcn.workitem.id.x() #0

	@lds.obj = addrspace(3) global [256 x i32] undef, align 4			@lds.obj = addrspace(3) global [256 x i32] undef, align 4

	; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:			; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
	; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0			; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
	; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]			; GCN: s_mov_b32 [[LDS:s[0-9]+]], lds.obj@abs32@lo
	; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]			; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, [[LDS]], [[SHL]]
				; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], [[LDS]], [[SHL]]
	; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b			; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
	; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12			; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
	define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {			define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
	entry:			entry:
	%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
	%sub1 = sub i32 0, %x.i			%sub1 = sub i32 0, %x.i
	%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1			%tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
	%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3			%arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
	▲ Show 20 Lines • Show All 145 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/ds_read2.ll

Show First 20 Lines • Show All 349 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
store float %sum, float addrspace(1)* %out.gep, align 4		store float %sum, float addrspace(1)* %out.gep, align 4
ret void		ret void
}		}

; GCN-LABEL: @simple_read2_f64		; GCN-LABEL: @simple_read2_f64
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}		; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8		; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}		; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}

; CI: buffer_store_dwordx2 [[RESULT]]		; CI: buffer_store_dwordx2 [[RESULT]]
; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]		; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {		define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i		%arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
}		}

@foo = addrspace(3) global [4 x i32] undef, align 4		@foo = addrspace(3) global [4 x i32] undef, align 4

; GCN-LABEL: @load_constant_adjacent_offsets		; GCN-LABEL: @load_constant_adjacent_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {		define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4		%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4		%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
%sum = add i32 %val0, %val1		%sum = add i32 %val0, %val1
store i32 %sum, i32 addrspace(1)* %out, align 4		store i32 %sum, i32 addrspace(1)* %out, align 4
ret void		ret void
}		}

; GCN-LABEL: @load_constant_disjoint_offsets		; GCN-LABEL: @load_constant_disjoint_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {		define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4		%val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4		%val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
%sum = add i32 %val0, %val1		%sum = add i32 %val0, %val1
store i32 %sum, i32 addrspace(1)* %out, align 4		store i32 %sum, i32 addrspace(1)* %out, align 4
ret void		ret void
}		}

@bar = addrspace(3) global [4 x i64] undef, align 4		@bar = addrspace(3) global [4 x i64] undef, align 4

; GCN-LABEL: @load_misaligned64_constant_offsets		; GCN-LABEL: @load_misaligned64_constant_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}		; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3		; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {		define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4		%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4		%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
%sum = add i64 %val0, %val1		%sum = add i64 %val0, %val1
store i64 %sum, i64 addrspace(1)* %out, align 8		store i64 %sum, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}

@bar.large = addrspace(3) global [4096 x i64] undef, align 4		@bar.large = addrspace(3) global [4096 x i64] undef, align 4

; GCN-LABEL: @load_misaligned64_constant_large_offsets		; GCN-LABEL: @load_misaligned64_constant_large_offsets
; CI-DAG: s_mov_b32 m0		; CI-DAG: s_mov_b32 m0
; GFX9-NOT: m0		; GFX9-NOT: m0

; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}		; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000		; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1		; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1		; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
		; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
		; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
		; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {		define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4		%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4		%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
%sum = add i64 %val0, %val1		%sum = add i64 %val0, %val1
store i64 %sum, i64 addrspace(1)* %out, align 8		store i64 %sum, i64 addrspace(1)* %out, align 8
ret void		ret void
}		}
▲ Show 20 Lines • Show All 165 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/ds_write2.ll

	Show First 20 Lines • Show All 97 Lines • ▼ Show 20 Lines
	; 2 data subregisters from different super registers.			; 2 data subregisters from different super registers.
	; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:			; GCN-LABEL: {{^}}simple_write2_two_val_subreg2_mixed_f32:
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}			; CI: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
	; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}			; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0

	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}			; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				;
				; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
				; early legalization of the constant bus constraint on the v_lshl_add_u32,
				; and then SIFoldOperands folds in an unlucky order.
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]

	; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}			; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
	; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}			; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i			%in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
	%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1			%in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
	%val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8			%val0 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.0, align 8
	%val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8			%val1 = load volatile <2 x float>, <2 x float> addrspace(1)* %in.gep.1, align 8
	%val0.0 = extractelement <2 x float> %val0, i32 0			%val0.0 = extractelement <2 x float> %val0, i32 0
	%val1.1 = extractelement <2 x float> %val1, i32 1			%val1.1 = extractelement <2 x float> %val1, i32 1
	%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i			%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
	store float %val0.0, float addrspace(3)* %arrayidx0, align 4			store float %val0.0, float addrspace(3)* %arrayidx0, align 4
	%add.x = add nsw i32 %x.i, 8			%add.x = add nsw i32 %x.i, 8
	%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x			%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
	store float %val1.1, float addrspace(3)* %arrayidx1, align 4			store float %val1.1, float addrspace(3)* %arrayidx1, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @simple_write2_two_val_subreg2_f32			; GCN-LABEL: @simple_write2_two_val_subreg2_f32
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: {{buffer\|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}			; GCN-DAG: {{buffer\|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i			%in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
	%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8			%val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
	%val0 = extractelement <2 x float> %val, i32 0			%val0 = extractelement <2 x float> %val, i32 0
	%val1 = extractelement <2 x float> %val, i32 1			%val1 = extractelement <2 x float> %val, i32 1
	%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i			%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
	store float %val0, float addrspace(3)* %arrayidx0, align 4			store float %val0, float addrspace(3)* %arrayidx0, align 4
	%add.x = add nsw i32 %x.i, 8			%add.x = add nsw i32 %x.i, 8
	%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x			%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x
	store float %val1, float addrspace(3)* %arrayidx1, align 4			store float %val1, float addrspace(3)* %arrayidx1, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @simple_write2_two_val_subreg4_f32			; GCN-LABEL: @simple_write2_two_val_subreg4_f32
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: {{buffer\|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}			; GCN-DAG: {{buffer\|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
	; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
				; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
				; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]

	; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8			; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {			define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
	%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i			%in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
	%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16			%val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
	%val0 = extractelement <4 x float> %val, i32 0			%val0 = extractelement <4 x float> %val, i32 0
	%val1 = extractelement <4 x float> %val, i32 3			%val1 = extractelement <4 x float> %val, i32 3
	▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines
	}			}

	@foo = addrspace(3) global [4 x i32] undef, align 4			@foo = addrspace(3) global [4 x i32] undef, align 4

	; GCN-LABEL: @store_constant_adjacent_offsets			; GCN-LABEL: @store_constant_adjacent_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
	; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	define amdgpu_kernel void @store_constant_adjacent_offsets() {			define amdgpu_kernel void @store_constant_adjacent_offsets() {
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
	ret void			ret void
	}			}

	; GCN-LABEL: @store_constant_disjoint_offsets			; GCN-LABEL: @store_constant_disjoint_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}			; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
	; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2			; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
	define amdgpu_kernel void @store_constant_disjoint_offsets() {			define amdgpu_kernel void @store_constant_disjoint_offsets() {
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
	store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4			store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
	ret void			ret void
	}			}

	@bar = addrspace(3) global [4 x i64] undef, align 4			@bar = addrspace(3) global [4 x i64] undef, align 4

	; GCN-LABEL: @store_misaligned64_constant_offsets			; GCN-LABEL: @store_misaligned64_constant_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}			; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
	; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3			; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @store_misaligned64_constant_offsets() {			define amdgpu_kernel void @store_misaligned64_constant_offsets() {
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
	ret void			ret void
	}			}

	@bar.large = addrspace(3) global [4096 x i64] undef, align 4			@bar.large = addrspace(3) global [4096 x i64] undef, align 4

	; GCN-LABEL: @store_misaligned64_constant_large_offsets			; GCN-LABEL: @store_misaligned64_constant_large_offsets
	; CI-DAG: s_mov_b32 m0			; CI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}			; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
	; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}			; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
	; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
	; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1			; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
				; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
				; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
				; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {			define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
	store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4			store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
	ret void			ret void
	}			}

	@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4			@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
	▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/lds-initializer.ll

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s
	; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s

	; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space			; CHECK: lds: unsupported initializer for address space

	@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]			@lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]

	define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {			define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
	%gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10			%gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
	%ld = load i32, i32 addrspace(3)* %gep			%ld = load i32, i32 addrspace(3)* %gep
	store i32 %ld, i32 addrspace(1)* %out			store i32 %ld, i32 addrspace(1)* %out
	ret void			ret void
	}			}

test/CodeGen/AMDGPU/lds-relocs.ll

This file was added.

				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s \| FileCheck -check-prefixes=GCN %s
				; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s \| llvm-readobj -r -t \| FileCheck -check-prefixes=ELF %s

				@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
				@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8

				; ELF: Relocations [
				; ELF-NEXT: Section (3) .rel.text {
				; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
				; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
				; ELF-NEXT: }
				; ELF-NEXT: ]

				; ELF: Symbol {
				; ELF: Name: lds.defined
				; ELF-NEXT: Value: 0x0
				; ELF-NEXT: Size: 32
				; ELF-NEXT: Binding: Global (0x1)
				; ELF-NEXT: Type: AMDGPU_LDS (0xD)
				; ELF-NEXT: Align: 8
				; ELF-NEXT: Other: 24
				; ELF-NEXT: Section: Undefined (0x0)
				; ELF-NEXT: }

				; ELF: Symbol {
				; ELF: Name: lds.external
				; ELF-NEXT: Value: 0x0
				; ELF-NEXT: Size: 0
				; ELF-NEXT: Binding: Global (0x1)
				; ELF-NEXT: Type: AMDGPU_LDS (0xD)
				; ELF-NEXT: Align: 4
				; ELF-NEXT: Other: 16
				; ELF-NEXT: Section: Undefined (0x0)
				; ELF-NEXT: }

				; GCN-LABEL: {{^}}test_basic:
				; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
				; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
				;
				; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
				; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
				;
				; GCN: .globl lds.external
				; GCN: .size lds.external, 0
				; GCN: .amdgpu_lds lds.external, 4
				; GCN: .globl lds.defined
				; GCN: .size lds.defined, 32
				; GCN: .amdgpu_lds lds.defined, 8
				define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
				main_body:
				%gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
				%tmp = load i32, i32 addrspace(3)* %gep0

				%mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
				%mask.32 = trunc i64 %mask to i32
				%gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
				store i32 %mask.32, i32 addrspace(3)* %gep1

				%r = bitcast i32 %tmp to float
				ret float %r
				}

				; Function Attrs: convergent nounwind readnone
				declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4

				attributes #0 = { "no-signed-zeros-fp-math"="true" }
				attributes #4 = { convergent nounwind readnone }

test/CodeGen/AMDGPU/lds-size.ll

	; RUN: llc -march=amdgcn < %s \| FileCheck -check-prefix=ALL -check-prefix=GCN %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s \| FileCheck -check-prefix=ALL -check-prefix=HSA %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s \| FileCheck -check-prefix=ALL -check-prefix=HSA %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=ALL -check-prefix=EG %s			; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=ALL -check-prefix=EG %s

	; This test makes sure we do not double count global values when they are			; This test makes sure we do not double count global values when they are
	; used in different basic blocks.			; used in different basic blocks.

	; GCN: .long 47180			; GCN: .long 47180
	; GCN-NEXT: .long 32900			; GCN-NEXT: .long 32900
	Show All 27 Lines

test/CodeGen/AMDGPU/lds-zero-initializer.ll

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck %s
	; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s			; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 \| FileCheck %s

	; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space			; CHECK: lds: unsupported initializer for address space

	@lds = addrspace(3) global [256 x i32] zeroinitializer			@lds = addrspace(3) global [256 x i32] zeroinitializer

	define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {			define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
	%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10			%gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
	%ld = load i32, i32 addrspace(3)* %gep			%ld = load i32, i32 addrspace(3)* %gep
	store i32 %ld, i32 addrspace(1)* %out			store i32 %ld, i32 addrspace(1)* %out
	ret void			ret void
	}			}

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

	Show First 20 Lines • Show All 262 Lines • ▼ Show 20 Lines
	}			}

	@lds0 = addrspace(3) global [512 x i32] undef			@lds0 = addrspace(3) global [512 x i32] undef

	; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:			; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
	; CIVI-DAG: s_mov_b32 m0			; CIVI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}			; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
				; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
				; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]

	; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8			; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
	define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {			define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
	%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%idx.0 = add nsw i32 %tid.x, 2			%idx.0 = add nsw i32 %tid.x, 2
	%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0			%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
	%val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)			%val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
	store i32 %idx.0, i32 addrspace(1)* %add_use			store i32 %idx.0, i32 addrspace(1)* %add_use
	store i32 %val0, i32 addrspace(1)* %out			store i32 %val0, i32 addrspace(1)* %out
	▲ Show 20 Lines • Show All 127 Lines • ▼ Show 20 Lines
	}			}

	@lds1 = addrspace(3) global [512 x i64] undef, align 8			@lds1 = addrspace(3) global [512 x i64] undef, align 8

	; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:			; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
	; CIVI-DAG: s_mov_b32 m0			; CIVI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}			; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
				; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
				; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
				; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]

	; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16			; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
	define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {			define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
	%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%idx.0 = add nsw i32 %tid.x, 2			%idx.0 = add nsw i32 %tid.x, 2
	%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0			%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
	%val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)			%val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
	store i32 %idx.0, i32 addrspace(1)* %add_use			store i32 %idx.0, i32 addrspace(1)* %add_use
	store i64 %val0, i64 addrspace(1)* %out			store i64 %val0, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }
	attributes #2 = { nounwind argmemonly }			attributes #2 = { nounwind argmemonly }

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

Show First 20 Lines • Show All 125 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5		%gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)		%result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
ret void		ret void
}		}

@lds0 = addrspace(3) global [512 x i32] undef, align 4		@lds0 = addrspace(3) global [512 x i32] undef, align 4

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:		; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
		; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
		; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
		; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8		; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)		%val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i32 %val0, i32 addrspace(1)* %out		store i32 %val0, i32 addrspace(1)* %out
▲ Show 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 {
%gep = getelementptr i32, i32* %gep.tid, i32 5		%gep = getelementptr i32, i32* %gep.tid, i32 5
%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)		%result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
ret void		ret void
}		}

@lds1 = addrspace(3) global [512 x i64] undef, align 8		@lds1 = addrspace(3) global [512 x i64] undef, align 8

; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:		; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}		; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
		; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
		; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
		; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16		; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
%val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)		%val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i64 %val0, i64 addrspace(1)* %out		store i64 %val0, i64 addrspace(1)* %out
▲ Show 20 Lines • Show All 89 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,NOHSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,HSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefixes=CHECK,HSA %s

@lds0 = addrspace(3) global [512 x float] undef, align 4		@lds0 = addrspace(3) global [512 x float] undef, align 4
@lds1 = addrspace(3) global [256 x float] undef, align 4		@lds1 = addrspace(3) global [256 x float] undef, align 4

@large = addrspace(3) global [4096 x i32] undef, align 4		@large = addrspace(3) global [4096 x i32] undef, align 4

; CHECK-LABEL: {{^}}groupstaticsize_test0:		; CHECK-LABEL: {{^}}groupstaticsize_test0:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {		define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4		store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4		%val0 = load float, float addrspace(3)* %arrayidx0, align 4
store float %val0, float addrspace(1)* %out, align 4		store float %val0, float addrspace(1)* %out, align 4

ret void		ret void
}		}

; CHECK-LABEL: {{^}}groupstaticsize_test1:		; CHECK-LABEL: {{^}}groupstaticsize_test1:
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {		define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
entry:		entry:
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4		store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%tmp = icmp eq i32 %cond, 0		%tmp = icmp eq i32 %cond, 0
br i1 %tmp, label %if, label %else		br i1 %tmp, label %if, label %else
Show All 11 Lines	else: ; preds = %entry
br label %endif		br label %endif

endif: ; preds = %else, %if		endif: ; preds = %else, %if
ret void		ret void
}		}

; Exceeds 16-bit simm limit of s_movk_i32		; Exceeds 16-bit simm limit of s_movk_i32
; CHECK-LABEL: {{^}}large_groupstaticsize:		; CHECK-LABEL: {{^}}large_groupstaticsize:
; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}		; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
		; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {		define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx		%gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
store volatile i32 0, i32 addrspace(3)* %gep		store volatile i32 0, i32 addrspace(3)* %gep
%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()		%static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
store i32 %static_lds_size, i32 addrspace(1)* %size		store i32 %static_lds_size, i32 addrspace(1)* %size
ret void		ret void
}		}

declare i32 @llvm.amdgcn.groupstaticsize() #1		declare i32 @llvm.amdgcn.groupstaticsize() #1
declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/local-memory.amdgcn.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=GCN %s
	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=CI -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=CI -check-prefix=GCN %s

	@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4			@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4

	; Check that the LDS size emitted correctly
	; SI: .long 47180
	; SI-NEXT: .long 65668
	; CI: .long 47180
	; CI-NEXT: .long 32900

	; GCN-LABEL: {{^}}local_memory:			; GCN-LABEL: {{^}}local_memory:

	; GCN-NOT: s_wqm_b64			; GCN-NOT: s_wqm_b64
	; GCN: ds_write_b32			; GCN: ds_write_b32

	; GCN: s_barrier			; GCN: s_barrier

	; GCN: ds_read_b32 {{v[0-9]+}},			; GCN: ds_read_b32 {{v[0-9]+}},
	Show All 32 Lines
	; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]			; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
	; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]			; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]

	; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]			; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
	; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]			; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]

	; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]			; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
	; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7			; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7

	define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {			define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
	entry:			entry:
	%x.i = call i32 @llvm.amdgcn.workitem.id.x()			%x.i = call i32 @llvm.amdgcn.workitem.id.x()
	%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i			%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
	store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4			store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
	%mul = shl nsw i32 %x.i, 1			%mul = shl nsw i32 %x.i, 1
	%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i			%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
	store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4			store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
	Show All 20 Lines

test/CodeGen/AMDGPU/local-memory.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s			; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s			; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

	@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4			@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4

	@lds = addrspace(3) global [512 x i32] undef, align 4			@lds = addrspace(3) global [512 x i32] undef, align 4

	; On SI we need to make sure that the base offset is a register and			; On SI we need to make sure that the base offset is a register and
	; not an immediate.			; not an immediate.

	; FUNC-LABEL: {{^}}load_i32_local_const_ptr:			; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
	; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0			; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
	; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4			; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4

	; R600: LDS_READ_RET			; R600: LDS_READ_RET
	define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {			define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
	entry:			entry:
	%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1			%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
	%tmp1 = load i32, i32 addrspace(3)* %tmp0			%tmp1 = load i32, i32 addrspace(3)* %tmp0
	%tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1			%tmp2 = getelementptr i32, i32 addrspace(1)* %out, i32 1
	store i32 %tmp1, i32 addrspace(1)* %tmp2			store i32 %tmp1, i32 addrspace(1)* %tmp2
	Show All 22 Lines

test/CodeGen/AMDGPU/merge-store-crash.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck %s

	; This is used to crash in LiveIntervalAnalysis via SILoadStoreOptimizer			; This is used to crash in LiveIntervalAnalysis via SILoadStoreOptimizer
	; while fixing up the merge of two ds_write instructions.			; while fixing up the merge of two ds_write instructions.

	@tess_lds = external addrspace(3) global [8192 x i32]			@tess_lds = external addrspace(3) global [8192 x i32]

	; CHECK-LABEL: {{^}}main:			; CHECK-LABEL: {{^}}main:
	; CHECK: ds_write2_b32			; CHECK: ds_write_b32
				; CHECK: ds_write_b32
	; CHECK: v_mov_b32_e32 v1, v0			; CHECK: v_mov_b32_e32 v1, v0
	; CHECK: tbuffer_store_format_xyzw v[0:3],			; CHECK: tbuffer_store_format_xyzw v[0:3],
	define amdgpu_vs void @main(i32 inreg %arg) {			define amdgpu_vs void @main(i32 inreg %arg) {
	main_body:			main_body:
	%tmp = load float, float addrspace(3)* undef, align 4			%tmp = load float, float addrspace(3)* undef, align 4
	%tmp1 = load float, float addrspace(3)* undef, align 4			%tmp1 = load float, float addrspace(3)* undef, align 4
	store float %tmp, float addrspace(3)* null, align 4			store float %tmp, float addrspace(3)* null, align 4
	%tmp2 = bitcast float %tmp to i32			%tmp2 = bitcast float %tmp to i32
	Show All 18 Lines

test/CodeGen/AMDGPU/over-max-lds-size.ll

This file was deleted.

	; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 \| FileCheck -check-prefix=ERROR %s
	; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 \| FileCheck -check-prefix=ERROR %s
	; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 \| FileCheck -check-prefix=ERROR %s

	; ERROR: error: local memory limit exceeded (400000) in use_huge_lds

	@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4

	define amdgpu_kernel void @use_huge_lds() {
	entry:
	%v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
	store i32 0, i32 addrspace(3)* %v0
	ret void
	}

test/CodeGen/AMDGPU/promote-alloca-globals.ll

	; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s \| FileCheck -check-prefix=IR %s			; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s \| FileCheck -check-prefix=IR %s
	; RUN: llc -march=amdgcn -mcpu=tonga < %s \| FileCheck -check-prefix=ASM %s			; RUN: llc -march=amdgcn -mcpu=tonga < %s \| FileCheck -check-prefix=ASM %s


	@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4			@global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
	@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4			@global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4

	; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {			; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
	; IR: alloca [10 x i32]			; IR: alloca [10 x i32]
	; ASM-LABEL: {{^}}promote_alloca_size_256:			; ASM-LABEL: {{^}}promote_alloca_size_256:
	; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)			; ASM: .size global_array0, 30000
				; ASM: .amdgpu_lds global_array0, 4
				; ASM: .size global_array1, 30000
				; ASM: .amdgpu_lds global_array1, 4

	define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {			define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
	entry:			entry:
	%stack = alloca [10 x i32], align 4, addrspace(5)			%stack = alloca [10 x i32], align 4, addrspace(5)
	%tmp = load i32, i32 addrspace(1)* %in, align 4			%tmp = load i32, i32 addrspace(1)* %in, align 4
	%arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp			%arrayidx1 = getelementptr inbounds [10 x i32], [10 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
	store i32 4, i32 addrspace(5)* %arrayidx1, align 4			store i32 4, i32 addrspace(5)* %arrayidx1, align 4
	%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1			%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
	Show All 16 Lines

test/CodeGen/AMDGPU/s_addk_i32.ll

	Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines
	; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}			; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
	; SI: s_endpgm			; SI: s_endpgm
	define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {			define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
	%add = add i32 %b, 32768 ; 1 << 15			%add = add i32 %b, 32768 ; 1 << 15
	store i32 %add, i32 addrspace(1)* %out			store i32 %add, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	@lds = addrspace(3) global [512 x i32] undef, align 4

	; SI-LABEL: {{^}}commute_s_addk_i32:
	; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
	define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
	%size = call i32 @llvm.amdgcn.groupstaticsize()
	%add = add i32 %size, %b
	call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
	ret void
	}

	declare i32 @llvm.amdgcn.groupstaticsize() #1

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/s_mulk_i32.ll

	Show All 34 Lines
	; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}			; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
	; SI: s_endpgm			; SI: s_endpgm
	define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {			define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
	%mul = mul i32 %b, 32769 ; 1 << 15 + 1			%mul = mul i32 %b, 32769 ; 1 << 15 + 1
	store i32 %mul, i32 addrspace(1)* %out			store i32 %mul, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	@lds = addrspace(3) global [512 x i32] undef, align 4

	; SI-LABEL: {{^}}commute_s_mulk_i32:
	; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
	define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
	%size = call i32 @llvm.amdgcn.groupstaticsize()
	%add = mul i32 %size, %b
	call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
	ret void
	}

	declare i32 @llvm.amdgcn.groupstaticsize() #1

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readnone }			attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/shl_add_ptr.ll

Show All 27 Lines	define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
store float %val0, float addrspace(1)* %out		store float %val0, float addrspace(1)* %out
ret void		ret void
}		}

; Make sure once the first use is folded into the addressing mode, the		; Make sure once the first use is folded into the addressing mode, the
; remaining add use goes through the normal shl + add constant fold.		; remaining add use goes through the normal shl + add constant fold.

; GCN-LABEL: {{^}}load_shl_base_lds_1:		; GCN-LABEL: {{^}}load_shl_base_lds_1:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}

		; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
		; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]

; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8		; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}		; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
; GCN-DAG: buffer_store_dword [[RESULT]]		; GCN-DAG: buffer_store_dword [[RESULT]]
; GCN-DAG: buffer_store_dword [[ADDUSE]]		; GCN-DAG: buffer_store_dword [[ADDUSE]]
; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {		define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 2		%idx.0 = add nsw i32 %tid.x, 2
Show All 18 Lines	define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
store i32 %idx.0, i32 addrspace(1)* %add_use		store i32 %idx.0, i32 addrspace(1)* %add_use
store i8 %val0, i8 addrspace(1)* %out		store i8 %val0, i8 addrspace(1)* %out
ret void		ret void
}		}

; The two globals are placed adjacent in memory, so the same base		; The two globals are placed adjacent in memory, so the same base
; pointer can be used with an offset into the second one.		; pointer can be used with an offset into the second one.

		; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints

; GCN-LABEL: {{^}}load_shl_base_lds_2:		; GCN-LABEL: {{^}}load_shl_base_lds_2:
; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}		; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
		; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
; GCN: s_mov_b32 m0, -1		; GCN: s_mov_b32 m0, -1
; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
		; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
		; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
		; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9

; GCN: s_endpgm		; GCN: s_endpgm
define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {		define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1		%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%idx.0 = add nsw i32 %tid.x, 64		%idx.0 = add nsw i32 %tid.x, 64
%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0		%arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
%val0 = load float, float addrspace(3)* %arrayidx0, align 4		%val0 = load float, float addrspace(3)* %arrayidx0, align 4
%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0		%arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds1, i32 0, i32 %idx.0
%val1 = load float, float addrspace(3)* %arrayidx1, align 4		%val1 = load float, float addrspace(3)* %arrayidx1, align 4
▲ Show 20 Lines • Show All 342 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/si-sgpr-spill.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s		; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

; These tests check that the compiler won't crash when it needs to spill		; These tests check that the compiler won't crash when it needs to spill
; SGPRs.		; SGPRs.

@ddxy_lds = external addrspace(3) global [64 x i32]

; GCN-LABEL: {{^}}main:		; GCN-LABEL: {{^}}main:
; GCN: s_wqm		; GCN: s_wqm

; Make sure not emitting unused scratch resource descriptor setup		; Make sure not emitting unused scratch resource descriptor setup
; GCN-NOT: s_mov_b32		; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32
; GCN-NOT: s_mov_b32

; GCN: s_mov_b32 m0		; GCN: s_mov_b32 m0

; Make sure scratch space isn't being used for SGPR->VGPR spills		; Make sure scratch space isn't being used for SGPR->VGPR spills

; Writing to M0 from an SMRD instruction will hang the GPU.		; Writing to M0 from an SMRD instruction will hang the GPU.
; GCN-NOT: s_buffer_load_dword m0		; GCN-NOT: s_buffer_load_dword m0
; GCN: s_endpgm		; GCN: s_endpgm

; TOVGPR: ScratchSize: 0{{$}}		; TOVGPR: ScratchSize: 0{{$}}
define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {		define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
main_body:		main_body:
		%lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0		%tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0		%tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
%tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)		%tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
%tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)		%tmp23 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 100, i32 0)
%tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)		%tmp24 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 104, i32 0)
%tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0)		%tmp25 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 112, i32 0)
%tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0)		%tmp26 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 116, i32 0)
%tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0)		%tmp27 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 120, i32 0)
▲ Show 20 Lines • Show All 161 Lines • ▼ Show 20 Lines	main_body:
%i.i1 = extractelement <2 x i32> %arg6, i32 0		%i.i1 = extractelement <2 x i32> %arg6, i32 0
%j.i2 = extractelement <2 x i32> %arg6, i32 1		%j.i2 = extractelement <2 x i32> %arg6, i32 1
%i.f.i3 = bitcast i32 %i.i1 to float		%i.f.i3 = bitcast i32 %i.i1 to float
%j.f.i4 = bitcast i32 %j.i2 to float		%j.f.i4 = bitcast i32 %j.i2 to float
%p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0		%p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0
%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0		%p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)		%tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
%tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109		%tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
%tmp111 = bitcast float %p2.i to i32		%tmp111 = bitcast float %p2.i to i32
store i32 %tmp111, i32 addrspace(3)* %tmp110		store i32 %tmp111, i32 addrspace(3)* %tmp110
%tmp112 = bitcast float %p2.i96 to i32		%tmp112 = bitcast float %p2.i96 to i32
store i32 %tmp112, i32 addrspace(3)* %tmp110		store i32 %tmp112, i32 addrspace(3)* %tmp110
%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)		%tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
%tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113		%tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
%tmp115 = and i32 %tmp113, -4		%tmp115 = and i32 %tmp113, -4
%tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115		%tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
%tmp117 = add i32 %tmp115, 1		%tmp117 = add i32 %tmp115, 1
%tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117		%tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
%tmp119 = bitcast float %p2.i to i32		%tmp119 = bitcast float %p2.i to i32
store i32 %tmp119, i32 addrspace(3)* %tmp114		store i32 %tmp119, i32 addrspace(3)* %tmp114
%tmp120 = load i32, i32 addrspace(3)* %tmp116		%tmp120 = load i32, i32 addrspace(3)* %tmp116
%tmp121 = bitcast i32 %tmp120 to float		%tmp121 = bitcast i32 %tmp120 to float
%tmp122 = load i32, i32 addrspace(3)* %tmp118		%tmp122 = load i32, i32 addrspace(3)* %tmp118
%tmp123 = bitcast i32 %tmp122 to float		%tmp123 = bitcast i32 %tmp122 to float
%tmp124 = fsub float %tmp123, %tmp121		%tmp124 = fsub float %tmp123, %tmp121
%tmp125 = bitcast float %p2.i96 to i32		%tmp125 = bitcast float %p2.i96 to i32
Show All 10 Lines	main_body:
%tmp135 = extractelement <4 x float> %tmp134, i32 0		%tmp135 = extractelement <4 x float> %tmp134, i32 0
%tmp136 = extractelement <4 x float> %tmp134, i32 1		%tmp136 = extractelement <4 x float> %tmp134, i32 1
%tmp137 = fmul float %tmp59, %p2.i		%tmp137 = fmul float %tmp59, %p2.i
%tmp138 = fmul float %tmp59, %p2.i96		%tmp138 = fmul float %tmp59, %p2.i96
%tmp139 = fmul float %tmp59, %p2.i96		%tmp139 = fmul float %tmp59, %p2.i96
%tmp140 = fmul float %tmp59, %p2.i96		%tmp140 = fmul float %tmp59, %p2.i96
%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)		%tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
%tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141		%tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
%tmp143 = bitcast float %tmp137 to i32		%tmp143 = bitcast float %tmp137 to i32
store i32 %tmp143, i32 addrspace(3)* %tmp142		store i32 %tmp143, i32 addrspace(3)* %tmp142
%tmp144 = bitcast float %tmp138 to i32		%tmp144 = bitcast float %tmp138 to i32
store i32 %tmp144, i32 addrspace(3)* %tmp142		store i32 %tmp144, i32 addrspace(3)* %tmp142
%tmp145 = bitcast float %tmp139 to i32		%tmp145 = bitcast float %tmp139 to i32
store i32 %tmp145, i32 addrspace(3)* %tmp142		store i32 %tmp145, i32 addrspace(3)* %tmp142
%tmp146 = bitcast float %tmp140 to i32		%tmp146 = bitcast float %tmp140 to i32
store i32 %tmp146, i32 addrspace(3)* %tmp142		store i32 %tmp146, i32 addrspace(3)* %tmp142
%mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)		%mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)		%tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
%tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147		%tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
%tmp149 = and i32 %tmp147, -4		%tmp149 = and i32 %tmp147, -4
%tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149		%tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
%tmp151 = add i32 %tmp149, 2		%tmp151 = add i32 %tmp149, 2
%tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151		%tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
%tmp153 = bitcast float %tmp137 to i32		%tmp153 = bitcast float %tmp137 to i32
store i32 %tmp153, i32 addrspace(3)* %tmp148		store i32 %tmp153, i32 addrspace(3)* %tmp148
%tmp154 = load i32, i32 addrspace(3)* %tmp150		%tmp154 = load i32, i32 addrspace(3)* %tmp150
%tmp155 = bitcast i32 %tmp154 to float		%tmp155 = bitcast i32 %tmp154 to float
%tmp156 = load i32, i32 addrspace(3)* %tmp152		%tmp156 = load i32, i32 addrspace(3)* %tmp152
%tmp157 = bitcast i32 %tmp156 to float		%tmp157 = bitcast i32 %tmp156 to float
%tmp158 = fsub float %tmp157, %tmp155		%tmp158 = fsub float %tmp157, %tmp155
%tmp159 = bitcast float %tmp138 to i32		%tmp159 = bitcast float %tmp138 to i32
▲ Show 20 Lines • Show All 1,427 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/sopk-compares.ll

; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s		; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s		; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI %s

; Since this intrinsic is exposed as a constant after isel, use it to
; defeat the DAG's compare with constant canonicalizations.
declare i32 @llvm.amdgcn.groupstaticsize() #1

@lds = addrspace(3) global [512 x i32] undef, align 4
arsenm: I'm 99% sure we don't have replacements for any of these tests either. The point of these isn't to use groupstaticsize, but to get some constant that isn't visible during selection. These shouldn't be removed.

; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:		; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}		; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {		define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:		entry:
%cmp0 = icmp eq i32 %cond, 4		%cmp0 = icmp eq i32 %cond, 4
br i1 %cmp0, label %endif, label %if		br i1 %cmp0, label %endif, label %if

if:		if:
▲ Show 20 Lines • Show All 209 Lines • ▼ Show 20 Lines	if:
call void asm sideeffect "", ""()		call void asm sideeffect "", ""()
br label %endif		br label %endif

endif:		endif:
store volatile i32 1, i32 addrspace(1)* %out		store volatile i32 1, i32 addrspace(1)* %out
ret void		ret void
}		}

; br_scc_sge_i32: branch on a signed >= compare of %cond against the value
; returned by llvm.amdgcn.groupstaticsize. The check line below expects the
; SOPK form s_cmpk_ge_i32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}br_scc_sge_i32:
; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
; The right-hand operand is produced by the intrinsic rather than written as a literal.
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sge i32 %cond, %size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}br_scc_slt_i32:		; GCN-LABEL: {{^}}br_scc_slt_i32:
; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}		; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {		define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:		entry:
%cmp0 = icmp slt i32 %cond, 65		%cmp0 = icmp slt i32 %cond, 65
br i1 %cmp0, label %endif, label %if		br i1 %cmp0, label %endif, label %if

if:		if:
call void asm sideeffect "", ""()		call void asm sideeffect "", ""()
br label %endif		br label %endif

endif:		endif:
store volatile i32 1, i32 addrspace(1)* %out		store volatile i32 1, i32 addrspace(1)* %out
ret void		ret void
}		}

; br_scc_sle_i32: branch on a signed <= compare of %cond against the value
; returned by llvm.amdgcn.groupstaticsize. The check line below expects the
; SOPK form s_cmpk_le_i32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}br_scc_sle_i32:
; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
; The right-hand operand is produced by the intrinsic rather than written as a literal.
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp sle i32 %cond, %size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; br_scc_ugt_i32: branch on an unsigned > compare of %cond against the value
; returned by llvm.amdgcn.groupstaticsize. The check line below expects the
; SOPK form s_cmpk_gt_u32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}br_scc_ugt_i32:
; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
; The right-hand operand is produced by the intrinsic rather than written as a literal.
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ugt i32 %cond, %size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; br_scc_uge_i32: branch on an unsigned >= compare of %cond against the value
; returned by llvm.amdgcn.groupstaticsize. The check line below expects the
; SOPK form s_cmpk_ge_u32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}br_scc_uge_i32:
; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
; The right-hand operand is produced by the intrinsic rather than written as a literal.
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp uge i32 %cond, %size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}br_scc_ult_i32:		; GCN-LABEL: {{^}}br_scc_ult_i32:
; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}		; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {		define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:		entry:
%cmp0 = icmp ult i32 %cond, 65		%cmp0 = icmp ult i32 %cond, 65
br i1 %cmp0, label %endif, label %if		br i1 %cmp0, label %endif, label %if

if:		if:
Show All 32 Lines	if:
call void asm sideeffect "", ""()		call void asm sideeffect "", ""()
br label %endif		br label %endif

endif:		endif:
store volatile i32 1, i32 addrspace(1)* %out		store volatile i32 1, i32 addrspace(1)* %out
ret void		ret void
}		}

; br_scc_ule_i32: branch on an unsigned <= compare of %cond against the value
; returned by llvm.amdgcn.groupstaticsize. The check line below expects the
; SOPK form s_cmpk_le_u32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}br_scc_ule_i32:
; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
arsenmUnsubmitted Done Reply Inline Actions Ditto arsenm: Ditto
entry:
; The right-hand operand is produced by the intrinsic rather than written as a literal.
%size = call i32 @llvm.amdgcn.groupstaticsize()
%cmp0 = icmp ule i32 %cond, %size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_eq_i32: equality compare with the groupstaticsize value on the
; left-hand side and %cond on the right. The check line below expects
; s_cmpk_eq_i32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp eq i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_ne_i32: inequality compare with the groupstaticsize value on
; the left-hand side and %cond on the right. The check line below expects
; s_cmpk_lg_i32 with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp ne i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_sgt_i32: signed "size > cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_lt_i32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp sgt i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_sge_i32: signed "size >= cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_le_i32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp sge i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_slt_i32: signed "size < cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_gt_i32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp slt i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_sle_i32: signed "size <= cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_ge_i32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp sle i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_ugt_i32: unsigned "size > cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_lt_u32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp ugt i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_uge_i32: unsigned "size >= cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_le_u32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp uge i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_ult_i32: unsigned "size < cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_gt_u32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp ult i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; commute_br_scc_ule_i32: unsigned "size <= cond" compare with the groupstaticsize
; value on the left-hand side. The check line below expects the mirrored
; predicate, s_cmpk_ge_u32, applied to the register with the immediate 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; Operand order is swapped relative to the non-commuted tests: %size comes first.
%cmp0 = icmp ule i32 %size, %cond
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; br_scc_ult_i32_non_u16: unsigned < compare of %cond against the bitwise NOT of
; the groupstaticsize value. The check line below expects the plain s_cmp_lt_u32
; form (not s_cmpk) with the literal 0xfffff7ff.
; NOTE(review): 0xfffff7ff is ~0x800; presumably the SOPK form is not used
; because the value does not fit a 16-bit immediate - confirm.
; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
entry:
%size = call i32 @llvm.amdgcn.groupstaticsize()
; xor with -1 inverts every bit of %size.
%not.size = xor i32 %size, -1
%cmp0 = icmp ult i32 %cond, %not.size
br i1 %cmp0, label %endif, label %if

if:
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
br label %endif

endif:
store volatile i32 1, i32 addrspace(1)* %out
ret void
}

; GCN-LABEL: {{^}}br_scc_eq_i64_inline_imm:		; GCN-LABEL: {{^}}br_scc_eq_i64_inline_imm:
; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4		; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4

; SI: v_cmp_eq_u64_e64		; SI: v_cmp_eq_u64_e64
define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {		define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
entry:		entry:
%cmp0 = icmp eq i64 %cond, 4		%cmp0 = icmp eq i64 %cond, 4
br i1 %cmp0, label %endif, label %if		br i1 %cmp0, label %endif, label %if
▲ Show 20 Lines • Show All 70 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/sub.i16.ll

Show First 20 Lines • Show All 138 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%a = load i16, i16 addrspace(1)* %gep.in0		%a = load i16, i16 addrspace(1)* %gep.in0
%b = load i16, i16 addrspace(1)* %gep.in1		%b = load i16, i16 addrspace(1)* %gep.in1
%add = sub i16 %a, %b		%add = sub i16 %a, %b
%ext = sext i16 %add to i64		%ext = sext i16 %add to i64
store i64 %ext, i64 addrspace(1)* %out		store i64 %ext, i64 addrspace(1)* %out
ret void		ret void
}		}

@lds = addrspace(3) global [512 x i32] undef, align 4
arsenm: Ditto

; v_test_sub_i16_constant_commute: subtracts the groupstaticsize value, truncated
; to i16, from a volatile-loaded i16. The VI check expects v_subrev_u16_e32 and
; the CI check expects v_subrev_i32_e32 (writing vcc), both with the literal 0x800.
; NOTE(review): 0x800 presumably is the 2048-byte size of @lds ([512 x i32]) - confirm.
; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%size.trunc = trunc i32 %size to i16
; Passes the address of @lds to inline asm in a VGPR; presumably this keeps
; the LDS global referenced by the kernel (TODO confirm).
call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %gep.out is computed but the store below writes to %out directly.
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
%a = load volatile i16, i16 addrspace(1)* %gep.in0
; The constant-like operand is the subtrahend (right-hand side) of the sub.
%add = sub i16 %a, %size.trunc
store i16 %add, i16 addrspace(1)* %out
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0		declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.groupstaticsize() #0

attributes #0 = { nounwind readnone }		attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }		attributes #1 = { nounwind }

test/CodeGen/AMDGPU/target-cpu.ll

Show First 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @target_fiji() #4 {
%gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext		%gep = getelementptr inbounds i32, i32 addrspace(1)* %ptr, i64 %id.ext
store i32 0, i32 addrspace(1)* %gep		store i32 0, i32 addrspace(1)* %gep
call void @llvm.amdgcn.s.dcache.wb()		call void @llvm.amdgcn.s.dcache.wb()
ret void		ret void
}		}

; CHECK-LABEL: {{^}}promote_alloca_enabled:		; CHECK-LABEL: {{^}}promote_alloca_enabled:
; CHECK: ds_read_b32		; CHECK: ds_read_b32
; CHECK: ; LDSByteSize: 5120
define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {		define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
entry:		entry:
%stack = alloca [5 x i32], align 4, addrspace(5)		%stack = alloca [5 x i32], align 4, addrspace(5)
%tmp = load i32, i32 addrspace(1)* %in, align 4		%tmp = load i32, i32 addrspace(1)* %in, align 4
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp		%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
%load = load i32, i32 addrspace(5)* %arrayidx1		%load = load i32, i32 addrspace(5)* %arrayidx1
store i32 %load, i32 addrspace(1)* %out		store i32 %load, i32 addrspace(1)* %out
ret void		ret void
Show All 23 Lines

test/CodeGen/MIR/AMDGPU/machine-function-info.ll

	; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after expand-isel-pseudos -o %t.mir %s			; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after expand-isel-pseudos -o %t.mir %s
	; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - \| FileCheck %s			; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - \| FileCheck %s

	; Test that SIMachineFunctionInfo can be round trip serialized through			; Test that SIMachineFunctionInfo can be round trip serialized through
	; MIR.			; MIR.

	@lds = addrspace(3) global [512 x float] undef, align 4			@lds = addrspace(3) global [512 x float] undef, align 4

	; CHECK-LABEL: {{^}}name: kernel			; CHECK-LABEL: {{^}}name: kernel
	; CHECK: machineFunctionInfo:			; CHECK: machineFunctionInfo:
	; CHECK-NEXT: explicitKernArgSize: 128			; CHECK-NEXT: explicitKernArgSize: 128
	; CHECK-NEXT: maxKernArgAlign: 64			; CHECK-NEXT: maxKernArgAlign: 64
	; CHECK-NEXT: ldsSize: 2048			; CHECK-NEXT: ldsSize: 0
	; CHECK-NEXT: isEntryFunction: true			; CHECK-NEXT: isEntryFunction: true
	; CHECK-NEXT: noSignedZerosFPMath: false			; CHECK-NEXT: noSignedZerosFPMath: false
	; CHECK-NEXT: memoryBound: false			; CHECK-NEXT: memoryBound: false
	; CHECK-NEXT: waveLimiter: false			; CHECK-NEXT: waveLimiter: false
	; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'			; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
	; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'			; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101'
	; CHECK-NEXT: frameOffsetReg: '$sgpr101'			; CHECK-NEXT: frameOffsetReg: '$sgpr101'
	; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg'			; CHECK-NEXT: stackPtrOffsetReg: '$sp_reg'
	▲ Show 20 Lines • Show All 62 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 198518

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

lib/Target/AMDGPU/SIFoldOperands.cpp

lib/Target/AMDGPU/SIISelLowering.h

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

test/CodeGen/AMDGPU/32-bit-local-address-space.ll

test/CodeGen/AMDGPU/constant-fold-mi-operands.ll

test/CodeGen/AMDGPU/ds-sub-offset.ll

test/CodeGen/AMDGPU/ds_read2.ll

test/CodeGen/AMDGPU/ds_write2.ll

test/CodeGen/AMDGPU/lds-initializer.ll

test/CodeGen/AMDGPU/lds-relocs.ll

test/CodeGen/AMDGPU/lds-size.ll

test/CodeGen/AMDGPU/lds-zero-initializer.ll

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

test/CodeGen/AMDGPU/local-memory.amdgcn.ll

test/CodeGen/AMDGPU/local-memory.ll

test/CodeGen/AMDGPU/merge-store-crash.ll

test/CodeGen/AMDGPU/over-max-lds-size.ll

test/CodeGen/AMDGPU/promote-alloca-globals.ll

test/CodeGen/AMDGPU/s_addk_i32.ll

test/CodeGen/AMDGPU/s_mulk_i32.ll

test/CodeGen/AMDGPU/shl_add_ptr.ll

test/CodeGen/AMDGPU/si-sgpr-spill.ll

test/CodeGen/AMDGPU/sopk-compares.ll

test/CodeGen/AMDGPU/sub.i16.ll

test/CodeGen/AMDGPU/target-cpu.ll

test/CodeGen/MIR/AMDGPU/machine-function-info.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 198518

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

lib/Target/AMDGPU/SIFoldOperands.cpp

lib/Target/AMDGPU/SIISelLowering.h

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

test/CodeGen/AMDGPU/32-bit-local-address-space.ll

test/CodeGen/AMDGPU/constant-fold-mi-operands.ll

test/CodeGen/AMDGPU/ds-sub-offset.ll

test/CodeGen/AMDGPU/ds_read2.ll

test/CodeGen/AMDGPU/ds_write2.ll

test/CodeGen/AMDGPU/lds-initializer.ll

test/CodeGen/AMDGPU/lds-relocs.ll

test/CodeGen/AMDGPU/lds-size.ll

test/CodeGen/AMDGPU/lds-zero-initializer.ll

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll

test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll

test/CodeGen/AMDGPU/local-memory.amdgcn.ll

test/CodeGen/AMDGPU/local-memory.ll

test/CodeGen/AMDGPU/merge-store-crash.ll

test/CodeGen/AMDGPU/over-max-lds-size.ll

test/CodeGen/AMDGPU/promote-alloca-globals.ll

test/CodeGen/AMDGPU/s_addk_i32.ll

test/CodeGen/AMDGPU/s_mulk_i32.ll

test/CodeGen/AMDGPU/shl_add_ptr.ll

test/CodeGen/AMDGPU/si-sgpr-spill.ll

test/CodeGen/AMDGPU/sopk-compares.ll

test/CodeGen/AMDGPU/sub.i16.ll

test/CodeGen/AMDGPU/target-cpu.ll

test/CodeGen/MIR/AMDGPU/machine-function-info.ll

AMDGPU: Write LDS objects out as global symbols in code generation
ClosedPublic