This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Add DS append/consume intrinsics
ClosedPublic

Authored by arsenm on Jan 28 2019, 9:58 AM.

Download Raw Diff

Details

Reviewers

rampitec
b-sumner

Summary

Since these pass the pointer in m0 unlike other DS instructions, these
need to worry about whether the address is uniform or not. This
assumes the address is dynamically uniform, and just uses
readfirstlane to get a copy into an SGPR.

I don't know if these have the same 16-bit add for the addressing mode
offset problem on SI or not, but I've just assumed they do.

Also includes some misc. changes to avoid test differences between the
LDS and GDS versions.

Diff Detail

Event Timeline

arsenm created this revision. Jan 28 2019, 9:58 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald Transcript · Jan 28 2019, 9:58 AM

arsenm marked an inline comment as done. Jan 28 2019, 9:59 AM

arsenm added inline comments.

include/llvm/IR/IntrinsicsAMDGPU.td
412–413	Not sure if we really need these, I should probably drop them

rampitec added inline comments. Jan 28 2019, 10:06 AM

lib/Target/AMDGPU/SIISelLowering.cpp
5505	Enable it or drop it.

I think it is perfectly reasonable to treat these as essentially relaxed-only atomic RMW operations and require the application to use fences or barriers if necessary. The ordering and scope are only needed if we ever need this operation to act as a non-relaxed atomic RMW.

Remove leftovers

LGTM

This revision is now accepted and ready to land. Jan 28 2019, 11:50 AM

r352422

phani added a subscriber: phani. Feb 12 2019, 7:49 PM

phani removed a subscriber: phani.

phani added a subscriber: phani.

Revision Contents

Path

Size

include/

llvm/

IR/

IntrinsicsAMDGPU.td

13 lines

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

87 lines

AMDGPULowerKernelArguments.cpp

3 lines

SIISelLowering.cpp

41 lines

SIInstrInfo.cpp

5 lines

test/

CodeGen/

AMDGPU/

llvm.amdgcn.ds.append.ll

125 lines

llvm.amdgcn.ds.consume.ll

125 lines

Diff 183897

include/llvm/IR/IntrinsicsAMDGPU.td

Show First 20 Lines • Show All 400 Lines • ▼ Show 20 Lines	class AMDGPUDSOrderedIntrinsic : Intrinsic<
llvm_i32_ty, // scope		llvm_i32_ty, // scope
llvm_i1_ty, // isVolatile		llvm_i1_ty, // isVolatile
llvm_i32_ty, // ordered count index (OA index), also added to the address		llvm_i32_ty, // ordered count index (OA index), also added to the address
llvm_i1_ty, // wave release, usually set to 1		llvm_i1_ty, // wave release, usually set to 1
llvm_i1_ty], // wave done, set to 1 for the last ordered instruction		llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
[NoCapture<0>]		[NoCapture<0>]
>;		>;

		class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
		[llvm_i32_ty],
		[llvm_anyptr_ty, // LDS or GDS ptr
		llvm_i32_ty, // ordering
		llvm_i32_ty, // scope
		[Inline comment — arsenm (author), unsubmitted, marked done] Not sure if we really need these, I should probably drop them
		llvm_i1_ty], // isVolatile
		[IntrConvergent, IntrArgMemOnly, NoCapture<0>]
		>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;		def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;		def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

		// The pointer argument is assumed to be dynamically uniform if a VGPR.
		def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
		def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;		def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;		def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;		def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;

} // TargetPrefix = "amdgcn"		} // TargetPrefix = "amdgcn"

// New-style image intrinsics		// New-style image intrinsics

▲ Show 20 Lines • Show All 1,118 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines	private:
bool isNoNanSrc(SDValue N) const;		bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;		bool isInlineImmediate(const SDNode *N) const;
bool isVGPRImm(const SDNode *N) const;		bool isVGPRImm(const SDNode *N) const;
bool isUniformLoad(const SDNode *N) const;		bool isUniformLoad(const SDNode *N) const;
bool isUniformBr(const SDNode *N) const;		bool isUniformBr(const SDNode *N) const;

MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;		MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

SDNode glueCopyToM0(SDNode N) const;		SDNode glueCopyToM0LDSInit(SDNode N) const;
		SDNode glueCopyToM0(SDNode N, SDValue Val) const;

const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;		const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,		bool isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const;		unsigned OffsetBits) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;		bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,		bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;		SDValue &Offset1) const;
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,		bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,		SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,		SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
SDValue &TFE) const;		SDValue &TFE) const;
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	private:
SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,		SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
uint32_t Offset, uint32_t Width);		uint32_t Offset, uint32_t Width);
void SelectS_BFEFromShifts(SDNode *N);		void SelectS_BFEFromShifts(SDNode *N);
void SelectS_BFE(SDNode *N);		void SelectS_BFE(SDNode *N);
bool isCBranchSCC(const SDNode *N) const;		bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);		void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);		void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);		void SelectATOMIC_CMP_SWAP(SDNode *N);
		void SelectINTRINSIC_W_CHAIN(SDNode *N);

protected:		protected:
// Include the pieces autogenerated from the target description.		// Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"		#include "AMDGPUGenDAGISel.inc"
};		};

class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {		class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
const R600Subtarget *Subtarget;		const R600Subtarget *Subtarget;
▲ Show 20 Lines • Show All 114 Lines • ▼ Show 20 Lines	case AMDGPU::REG_SEQUENCE: {
SDValue SubRegOp = N->getOperand(OpNo + 1);		SDValue SubRegOp = N->getOperand(OpNo + 1);
unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();		unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,		return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
SubRegIdx);		SubRegIdx);
}		}
}		}
}		}

SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N) const {		SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N, SDValue Val) const {
if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS \|\|
!Subtarget->ldsRequiresM0Init())
return N;

const SITargetLowering& Lowering =		const SITargetLowering& Lowering =
static_cast<const SITargetLowering>(getTargetLowering());		static_cast<const SITargetLowering>(getTargetLowering());

// Write max value to m0 before each load operation		// Write max value to m0 before each load operation

SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),		SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));		Val);

SDValue Glue = M0.getValue(1);		SDValue Glue = M0.getValue(1);

SmallVector <SDValue, 8> Ops;		SmallVector <SDValue, 8> Ops;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {		for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));		Ops.push_back(N->getOperand(i));
}
Ops.push_back(Glue);		Ops.push_back(Glue);
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);		return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}		}

		SDNode AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode N) const {
		if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS \|\|
		!Subtarget->ldsRequiresM0Init())
		return N;
		return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
		}

MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,		MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
EVT VT) const {		EVT VT) const {
SDNode *Lo = CurDAG->getMachineNode(		SDNode *Lo = CurDAG->getMachineNode(
AMDGPU::S_MOV_B32, DL, MVT::i32,		AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));		CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
SDNode *Hi =		SDNode *Hi =
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,		CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getConstant(Imm >> 32, DL, MVT::i32));		CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	if (N->isMachineOpcode()) {
return; // Already selected.		return; // Already selected.
}		}

if (isa<AtomicSDNode>(N) \|\|		if (isa<AtomicSDNode>(N) \|\|
(Opc == AMDGPUISD::ATOMIC_INC \|\| Opc == AMDGPUISD::ATOMIC_DEC \|\|		(Opc == AMDGPUISD::ATOMIC_INC \|\| Opc == AMDGPUISD::ATOMIC_DEC \|\|
Opc == ISD::ATOMIC_LOAD_FADD \|\|		Opc == ISD::ATOMIC_LOAD_FADD \|\|
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN \|\|		Opc == AMDGPUISD::ATOMIC_LOAD_FMIN \|\|
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))		Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
N = glueCopyToM0(N);		N = glueCopyToM0LDSInit(N);

switch (Opc) {		switch (Opc) {
default:		default:
break;		break;
// We are selecting i64 ADD here instead of custom lower it during		// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address		// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.		// calculation into the LOAD and STORE instructions.
case ISD::ADDC:		case ISD::ADDC:
▲ Show 20 Lines • Show All 81 Lines • ▼ Show 20 Lines	case ISD::ConstantFP: {
SDLoc DL(N);		SDLoc DL(N);
ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));		ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;		return;
}		}
case ISD::LOAD:		case ISD::LOAD:
case ISD::STORE:		case ISD::STORE:
case ISD::ATOMIC_LOAD:		case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {		case ISD::ATOMIC_STORE: {
N = glueCopyToM0(N);		N = glueCopyToM0LDSInit(N);
break;		break;
}		}

case AMDGPUISD::BFE_I32:		case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {		case AMDGPUISD::BFE_U32: {
// There is a scalar version available, but unlike the vector version which		// There is a scalar version available, but unlike the vector version which
// has a separate operand for the offset and width, the scalar version packs		// has a separate operand for the offset and width, the scalar version packs
// the width and offset into a single operand. Try to move to the scalar		// the width and offset into a single operand. Try to move to the scalar
▲ Show 20 Lines • Show All 61 Lines • ▼ Show 20 Lines	case AMDGPUISD::CVT_PK_I16_I32: {
// Hack around using a legal type if f16 is illegal.		// Hack around using a legal type if f16 is illegal.
if (N->getValueType(0) == MVT::i32) {		if (N->getValueType(0) == MVT::i32) {
MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;		MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),		N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
{ N->getOperand(0), N->getOperand(1) });		{ N->getOperand(0), N->getOperand(1) });
SelectCode(N);		SelectCode(N);
return;		return;
}		}

		break;
		}
		case ISD::INTRINSIC_W_CHAIN: {
		SelectINTRINSIC_W_CHAIN(N);
		return;
}		}
}		}

SelectCode(N);		SelectCode(N);
}		}

bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {		bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();		const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
▲ Show 20 Lines • Show All 164 Lines • ▼ Show 20 Lines	void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;		unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32 : AMDGPU::V_MAD_U64_U32;

SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);		SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),		SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
Clamp };		Clamp };
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);		CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}		}

bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,		bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
unsigned OffsetBits) const {		unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) \|\|		if ((OffsetBits == 16 && !isUInt<16>(Offset)) \|\|
(OffsetBits == 8 && !isUInt<8>(Offset)))		(OffsetBits == 8 && !isUInt<8>(Offset)))
return false;		return false;

if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS \|\|		if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS \|\|
Subtarget->unsafeDSOffsetFoldingEnabled())		Subtarget->unsafeDSOffsetFoldingEnabled())
return true;		return true;
▲ Show 20 Lines • Show All 915 Lines • ▼ Show 20 Lines	void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
SDValue Extract		SDValue Extract
= CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));		= CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

ReplaceUses(SDValue(N, 0), Extract);		ReplaceUses(SDValue(N, 0), Extract);
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));		ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
CurDAG->RemoveDeadNode(N);		CurDAG->RemoveDeadNode(N);
}		}

		void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
		unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
		if ((IntrID != Intrinsic::amdgcn_ds_append &&
		IntrID != Intrinsic::amdgcn_ds_consume) \|\|
		N->getValueType(0) != MVT::i32) {
		SelectCode(N);
		return;
		}

		// The address is assumed to be uniform, so if it ends up in a VGPR, it will
		// be copied to an SGPR with readfirstlane.
		unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
		AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

		SDValue Chain = N->getOperand(0);
		SDValue Ptr = N->getOperand(2);
		MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
		bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

		SDValue Offset;
		if (CurDAG->isBaseWithConstantOffset(Ptr)) {
		SDValue PtrBase = Ptr.getOperand(0);
		SDValue PtrOffset = Ptr.getOperand(1);

		const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
		if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
		N = glueCopyToM0(N, PtrBase);
		Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
		}
		}

		if (!Offset) {
		N = glueCopyToM0(N, Ptr);
		Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
		}

		SDValue Ops[] = {
		Offset,
		CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
		Chain,
		N->getOperand(N->getNumOperands() - 1) // New glue
		};

		CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
		}

bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,		bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
unsigned &Mods) const {		unsigned &Mods) const {
Mods = 0;		Mods = 0;
Src = In;		Src = In;

if (Src.getOpcode() == ISD::FNEG) {		if (Src.getOpcode() == ISD::FNEG) {
Mods \|= SISrcMods::NEG;		Mods \|= SISrcMods::NEG;
Src = Src.getOperand(0);		Src = Src.getOperand(0);
▲ Show 20 Lines • Show All 504 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp

Show First 20 Lines • Show All 103 Lines • ▼ Show 20 Lines	for (Argument &Arg : F.args()) {
if (Arg.use_empty())		if (Arg.use_empty())
continue;		continue;

if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {		if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing		// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We		// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
// can't represent this with range metadata because it's only allowed for		// can't represent this with range metadata because it's only allowed for
// integer types.		// integer types.
if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&		if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|
		PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)		ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
continue;		continue;

// FIXME: We can replace this with equivalent alias.scope/noalias		// FIXME: We can replace this with equivalent alias.scope/noalias
// metadata, but this appears to be a lot of work.		// metadata, but this appears to be a lot of work.
if (Arg.hasNoAliasAttr())		if (Arg.hasNoAliasAttr())
continue;		continue;
}		}
▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 920 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_ds_fmax: {
Info.flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;		Info.flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;

const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));		const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
if (!Vol \|\| !Vol->isZero())		if (!Vol \|\| !Vol->isZero())
Info.flags \|= MachineMemOperand::MOVolatile;		Info.flags \|= MachineMemOperand::MOVolatile;

return true;		return true;
}		}
		case Intrinsic::amdgcn_ds_append:
		case Intrinsic::amdgcn_ds_consume: {
		Info.opc = ISD::INTRINSIC_W_CHAIN;
		Info.memVT = MVT::getVT(CI.getType());
		Info.ptrVal = CI.getOperand(0);
		Info.align = 0;
		Info.flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;

		const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(3));
		if (!Vol \|\| !Vol->isZero())
		Info.flags \|= MachineMemOperand::MOVolatile;

		return true;
		}
default:		default:
return false;		return false;
}		}
}		}

bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,		bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,		SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {		Type *&AccessTy) const {
▲ Show 20 Lines • Show All 1,035 Lines • ▼ Show 20 Lines	if (IsEntryFunc && VA.isMemLoc()) {

SDValue Arg = lowerKernargMemParameter(		SDValue Arg = lowerKernargMemParameter(
DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);		DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));		Chains.push_back(Arg.getValue(1));

auto *ParamTy =		auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));		dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&		if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {		ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|
		ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
// On SI local pointers are just offsets into LDS, so they are always		// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be		// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.		// real pointers, so we can't guarantee their size.
Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,		Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
DAG.getValueType(MVT::i16));		DAG.getValueType(MVT::i16));
}		}

InVals.push_back(Arg);		InVals.push_back(Arg);
▲ Show 20 Lines • Show All 3,493 Lines • ▼ Show 20 Lines
}		}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,		SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();		unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
SDLoc DL(Op);		SDLoc DL(Op);

switch (IntrID) {		switch (IntrID) {
		#if 0
		[Inline comment — rampitec, unsubmitted, not done] Enable it or drop it.
		case Intrinsic::amdgcn_ds_consume:
		case Intrinsic::amdgcn_ds_append: {
		MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(Op);
		SDValue Chain = M->getChain();
		SDValue Ptr = M->getBasePtr();

		SDValue Ptr, Offset;

		if (CurDAG->isBaseWithConstantOffset(Ptr)) {
		SDValue N0 = Addr.getOperand(0);
		SDValue N1 = Addr.getOperand(1);

		}

		#if 0
		Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
		SDValue Glue = Chain.getValue(1);
		return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
		Op.getOperand(2), Glue);
		#endif

		return SDValue();
		}
		#endif
case Intrinsic::amdgcn_ds_ordered_add:		case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {		case Intrinsic::amdgcn_ds_ordered_swap: {
MemSDNode *M = cast<MemSDNode>(Op);		MemSDNode *M = cast<MemSDNode>(Op);
SDValue Chain = M->getOperand(0);		SDValue Chain = M->getOperand(0);
SDValue M0 = M->getOperand(2);		SDValue M0 = M->getOperand(2);
SDValue Value = M->getOperand(3);		SDValue Value = M->getOperand(3);
unsigned OrderedCountIndex = M->getConstantOperandVal(7);		unsigned OrderedCountIndex = M->getConstantOperandVal(7);
unsigned WaveRelease = M->getConstantOperandVal(8);		unsigned WaveRelease = M->getConstantOperandVal(8);
▲ Show 20 Lines • Show All 4,315 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 269 Lines • ▼ Show 20 Lines	bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
unsigned Opc = LdSt.getOpcode();		unsigned Opc = LdSt.getOpcode();

if (isDS(LdSt)) {		if (isDS(LdSt)) {
const MachineOperand *OffsetImm =		const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);		getNamedOperand(LdSt, AMDGPU::OpName::offset);
if (OffsetImm) {		if (OffsetImm) {
// Normal, single offset LDS instruction.		// Normal, single offset LDS instruction.
BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);		BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
		// TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
		// report that here?
		if (!BaseOp)
		return false;

Offset = OffsetImm->getImm();		Offset = OffsetImm->getImm();
assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "		assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
"operands of type register.");		"operands of type register.");
return true;		return true;
}		}

// The 2 offset instructions use offset0 and offset1 instead. We can treat		// The 2 offset instructions use offset0 and offset1 instead. We can treat
// these as a load with a single offset if the 2 offsets are consecutive. We		// these as a load with a single offset if the 2 offsets are consecutive. We
▲ Show 20 Lines • Show All 5,353 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s

				; GCN-LABEL: {{^}}ds_append_lds:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_lds_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
				%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
				; GCN: s_load_dword [[PTR:s[0-9]+]]

				; SI: s_add_i32 [[PTR]], [[PTR]], 16
				; SI: s_mov_b32 m0, [[PTR]]
				; SI: ds_append [[RESULT:v[0-9]+]]{{$}}

				; CIPLUS: s_mov_b32 m0, [[PTR]]
				; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}

				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
				%lds = load i32 addrspace(3), i32 addrspace(3) addrspace(4)* %lds.ptr, align 4
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
				%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]

				; SI: s_bitset1_b32 [[PTR]], 16
				; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000

				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
				%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
				; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
				; GCN: s_mov_b32 m0, [[READLANE]]
				; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_gds:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_gds_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
				%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
				define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
				%val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
				; NOTGFX9: s_mov_b32 m0, -1
				; GFX9-NOT: m0
				; GCN: _store_dword
				; GCN: ds_read_b32
				define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %val0, i32 addrspace(1)* %out
				%val1 = load volatile i32, i32 addrspace(3)* %lds
				ret void
				}

				declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i32, i32, i1) #1
				declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i32, i32, i1) #1

				attributes #0 = { nounwind }
				attributes #1 = { argmemonly convergent nounwind }

test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s

				; GCN-LABEL: {{^}}ds_consume_lds:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.}}store{{.}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; Largest foldable offset: a GEP of 16383 i32 elements (65532 bytes) is
				; expected to be folded into the instruction's offset field rather than
				; added to the pointer placed in m0.
				; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
				%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; The LDS pointer is loaded from memory through an addrspace(4) pointer
				; and then advanced by 4 i32 elements (16 bytes). With the SI prefix the
				; 16 bytes are expected as an explicit s_add_i32 on the pointer with no
				; instruction offset; with the CIPLUS prefix they are expected to be
				; folded into the offset field.
				; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
				; GCN: s_load_dword [[PTR:s[0-9]+]]

				; SI: s_add_i32 [[PTR]], [[PTR]], 16
				; SI: s_mov_b32 m0, [[PTR]]
				; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}

				; CIPLUS: s_mov_b32 m0, [[PTR]]
				; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}

				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
				%lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
				%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; One element past the foldable range: a GEP of 16384 i32 elements is
				; 65536 bytes (0x10000), so the offset must be applied to the pointer
				; itself and ds_consume is emitted with no offset. The SI prefix expects
				; this as setting bit 16 (s_bitset1_b32); the CIPLUS prefix expects an
				; s_add_i32 of 0x10000.
				; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]

				; SI: s_bitset1_b32 [[PTR]], 16
				; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000

				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
				%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; Non-kernel function: the pointer argument arrives in v0, so it is
				; expected to be moved to an SGPR with v_readfirstlane_b32 before being
				; copied into m0 for ds_consume.
				; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
				; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
				; GCN: s_mov_b32 m0, [[READLANE]]
				; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GDS variant of the basic case: the pointer is in addrspace(2) and the
				; emitted ds_consume is expected to carry the gds modifier and no offset.
				; GCN-LABEL: {{^}}ds_consume_gds:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GDS variant of the largest foldable offset: a GEP of 16383 i32 elements
				; (65532 bytes) is expected in the offset field alongside the gds
				; modifier.
				; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
				; GCN: {{.*}}store{{.*}} [[RESULT]]
				define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
				%val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				; GDS pointer advanced by 16384 i32 elements. Only the function label is
				; matched here; no instruction sequence is checked for this case.
				; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
				define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
				%past.max = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
				%consumed = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %past.max, i32 0, i32 0, i1 false)
				store i32 %consumed, i32 addrspace(1)* %out
				ret void
				}

				; m0 handling after the intrinsic: ds_consume places the pointer in m0,
				; and a regular volatile LDS load follows. Targets using the NOTGFX9
				; prefix are expected to set m0 back to -1 before that load, while the
				; GFX9 prefix expects no further mention of m0.
				; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
				; GCN: s_load_dword [[PTR:s[0-9]+]]
				; GCN: s_mov_b32 m0, [[PTR]]
				; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
				; NOTGFX9: s_mov_b32 m0, -1
				; GFX9-NOT: m0
				; GCN: _store_dword
				; GCN: ds_read_b32
				define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
				%consumed = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
				store i32 %consumed, i32 addrspace(1)* %out
				%reload = load volatile i32, i32 addrspace(3)* %lds
				ret void
				}

				; Overloads of the ds.consume intrinsic used by the tests in this file,
				; mangled on the pointer type: p3i32 takes an addrspace(3) pointer (the
				; LDS tests) and p2i32 takes an addrspace(2) pointer (the GDS tests).
				; Every call in this file passes (i32 0, i32 0, i1 false) for the
				; trailing operands.
				; NOTE(review): the trailing operands are presumably ordering, scope and
				; a volatile flag -- confirm against IntrinsicsAMDGPU.td.
				declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i32, i32, i1) #1
				declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i32, i32, i1) #1

				; #0 is attached to the test functions, #1 to the intrinsic declarations.
				attributes #0 = { nounwind }
				attributes #1 = { argmemonly convergent nounwind }