Diff 210133

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Show All 34 Lines	enum DPP_CTRL {
DPP_ROW_SR8 = 0x118,		DPP_ROW_SR8 = 0x118,
DPP_WF_SR1 = 0x138,		DPP_WF_SR1 = 0x138,
DPP_ROW_BCAST15 = 0x142,		DPP_ROW_BCAST15 = 0x142,
DPP_ROW_BCAST31 = 0x143		DPP_ROW_BCAST31 = 0x143
};		};

struct ReplacementInfo {		struct ReplacementInfo {
Instruction *I;		Instruction *I;
Instruction::BinaryOps Op;		AtomicRMWInst::BinOp Op;
unsigned ValIdx;		unsigned ValIdx;
bool ValDivergent;		bool ValDivergent;
};		};

class AMDGPUAtomicOptimizer : public FunctionPass,		class AMDGPUAtomicOptimizer : public FunctionPass,
public InstVisitor<AMDGPUAtomicOptimizer> {		public InstVisitor<AMDGPUAtomicOptimizer> {
private:		private:
SmallVector<ReplacementInfo, 8> ToReplace;		SmallVector<ReplacementInfo, 8> ToReplace;
const LegacyDivergenceAnalysis *DA;		const LegacyDivergenceAnalysis *DA;
const DataLayout *DL;		const DataLayout *DL;
DominatorTree *DT;		DominatorTree *DT;
bool HasDPP;		bool HasDPP;
bool IsPixelShader;		bool IsPixelShader;

void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,		void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
unsigned ValIdx, bool ValDivergent) const;		bool ValDivergent) const;

public:		public:
static char ID;		static char ID;

AMDGPUAtomicOptimizer() : FunctionPass(ID) {}		AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

bool runOnFunction(Function &F) override;		bool runOnFunction(Function &F) override;

▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
switch (I.getPointerAddressSpace()) {		switch (I.getPointerAddressSpace()) {
default:		default:
return;		return;
case AMDGPUAS::GLOBAL_ADDRESS:		case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::LOCAL_ADDRESS:		case AMDGPUAS::LOCAL_ADDRESS:
break;		break;
}		}

Instruction::BinaryOps Op;		AtomicRMWInst::BinOp Op = I.getOperation();

switch (I.getOperation()) {		switch (Op) {
default:		default:
return;		return;
case AtomicRMWInst::Add:		case AtomicRMWInst::Add:
Op = Instruction::Add;
break;
case AtomicRMWInst::Sub:		case AtomicRMWInst::Sub:
Op = Instruction::Sub;		case AtomicRMWInst::Max:
		case AtomicRMWInst::Min:
		case AtomicRMWInst::UMax:
		case AtomicRMWInst::UMin:
break;		break;
}		}

const unsigned PtrIdx = 0;		const unsigned PtrIdx = 0;
const unsigned ValIdx = 1;		const unsigned ValIdx = 1;

// If the pointer operand is divergent, then each lane is doing an atomic		// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.		// operation on a different address, and we cannot optimize that.
Show All 15 Lines	void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
// atomic operation to do the calculation for the entire wavefront, so		// atomic operation to do the calculation for the entire wavefront, so
// remember the instruction so we can come back to it.		// remember the instruction so we can come back to it.
const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};		const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

ToReplace.push_back(Info);		ToReplace.push_back(Info);
}		}

void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {		void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
Instruction::BinaryOps Op;		AtomicRMWInst::BinOp Op;

switch (I.getIntrinsicID()) {		switch (I.getIntrinsicID()) {
default:		default:
return;		return;
case Intrinsic::amdgcn_buffer_atomic_add:		case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:		case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:		case Intrinsic::amdgcn_raw_buffer_atomic_add:
Op = Instruction::Add;		Op = AtomicRMWInst::Add;
break;		break;
case Intrinsic::amdgcn_buffer_atomic_sub:		case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:		case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:		case Intrinsic::amdgcn_raw_buffer_atomic_sub:
Op = Instruction::Sub;		Op = AtomicRMWInst::Sub;
		break;
		case Intrinsic::amdgcn_buffer_atomic_smin:
		case Intrinsic::amdgcn_struct_buffer_atomic_smin:
		case Intrinsic::amdgcn_raw_buffer_atomic_smin:
		Op = AtomicRMWInst::Min;
		break;
		case Intrinsic::amdgcn_buffer_atomic_umin:
		case Intrinsic::amdgcn_struct_buffer_atomic_umin:
		case Intrinsic::amdgcn_raw_buffer_atomic_umin:
		Op = AtomicRMWInst::UMin;
		break;
		case Intrinsic::amdgcn_buffer_atomic_smax:
		case Intrinsic::amdgcn_struct_buffer_atomic_smax:
		case Intrinsic::amdgcn_raw_buffer_atomic_smax:
		Op = AtomicRMWInst::Max;
		break;
		case Intrinsic::amdgcn_buffer_atomic_umax:
		case Intrinsic::amdgcn_struct_buffer_atomic_umax:
		case Intrinsic::amdgcn_raw_buffer_atomic_umax:
		Op = AtomicRMWInst::UMax;
break;		break;
}		}

const unsigned ValIdx = 0;		const unsigned ValIdx = 0;

const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));		const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

// If the value operand is divergent, each lane is contributing a different		// If the value operand is divergent, each lane is contributing a different
Show All 15 Lines	void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
// If we get here, we can optimize the atomic using a single wavefront-wide		// If we get here, we can optimize the atomic using a single wavefront-wide
// atomic operation to do the calculation for the entire wavefront, so		// atomic operation to do the calculation for the entire wavefront, so
// remember the instruction so we can come back to it.		// remember the instruction so we can come back to it.
const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};		const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

ToReplace.push_back(Info);		ToReplace.push_back(Info);
}		}

		// Use the builder to create the non-atomic counterpart of the specified
		// atomicrmw binary op.
		static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
		Value *LHS, Value *RHS) {
		CmpInst::Predicate Pred;

		switch (Op) {
		default:
		llvm_unreachable("Unhandled atomic op");
		case AtomicRMWInst::Add:
		return B.CreateBinOp(Instruction::Add, LHS, RHS);
		case AtomicRMWInst::Sub:
		return B.CreateBinOp(Instruction::Sub, LHS, RHS);

		case AtomicRMWInst::Max:
		Pred = CmpInst::ICMP_SGT;
		break;
		case AtomicRMWInst::Min:
		Pred = CmpInst::ICMP_SLT;
		break;
		case AtomicRMWInst::UMax:
		Pred = CmpInst::ICMP_UGT;
		break;
		case AtomicRMWInst::UMin:
		Pred = CmpInst::ICMP_ULT;
		break;
		}
		Value *Cond = B.CreateICmp(Pred, LHS, RHS);
		return B.CreateSelect(Cond, LHS, RHS);
		}

		static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
		unsigned BitWidth) {
		switch (Op) {
		default:
		llvm_unreachable("Unhandled atomic op");
		case AtomicRMWInst::Add:
		case AtomicRMWInst::Sub:
		case AtomicRMWInst::UMax:
		return APInt::getMinValue(BitWidth);
		case AtomicRMWInst::UMin:
		return APInt::getMaxValue(BitWidth);
		case AtomicRMWInst::Max:
		return APInt::getSignedMinValue(BitWidth);
		case AtomicRMWInst::Min:
		return APInt::getSignedMaxValue(BitWidth);
		}
		}

void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,		void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Instruction::BinaryOps Op,		AtomicRMWInst::BinOp Op,
unsigned ValIdx,		unsigned ValIdx,
bool ValDivergent) const {		bool ValDivergent) const {
// Start building just before the instruction.		// Start building just before the instruction.
IRBuilder<> B(&I);		IRBuilder<> B(&I);

// If we are in a pixel shader, because of how we have to mask out helper		// If we are in a pixel shader, because of how we have to mask out helper
// lane invocations, we need to record the entry and exit BB's.		// lane invocations, we need to record the entry and exit BB's.
BasicBlock *PixelEntryBB = nullptr;		BasicBlock *PixelEntryBB = nullptr;
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));		Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
CallInst *const PartialMbcnt = B.CreateIntrinsic(		CallInst *const PartialMbcnt = B.CreateIntrinsic(
Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});		Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},		CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
{ExtractHi, PartialMbcnt});		{ExtractHi, PartialMbcnt});

Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);		Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

Value *LaneOffset = nullptr;		Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

		Value *ExclScan = nullptr;
Value *NewV = nullptr;		Value *NewV = nullptr;

// If we have a divergent value in each lane, we need to combine the value		// If we have a divergent value in each lane, we need to combine the value
// using DPP.		// using DPP.
if (ValDivergent) {		if (ValDivergent) {
Value *const Identity = B.getIntN(TyBitWidth, 0);		// First we need to set all inactive invocations to the identity value, so
		// that they can correctly contribute to the final result.
// First we need to set all inactive invocations to 0, so that they can
// correctly contribute to the final result.
CallInst *const SetInactive =		CallInst *const SetInactive =
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});		B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

CallInst *const FirstDPP =		CallInst *const FirstDPP =
B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,		B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
{Identity, SetInactive, B.getInt32(DPP_WF_SR1),		{Identity, SetInactive, B.getInt32(DPP_WF_SR1),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});		B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
NewV = FirstDPP;		ExclScan = FirstDPP;

const unsigned Iters = 7;		const unsigned Iters = 7;
const unsigned DPPCtrl[Iters] = {		const unsigned DPPCtrl[Iters] = {
DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,		DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};		DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};		const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};		const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};

// This loop performs an exclusive scan across the wavefront, with all lanes		// This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).		// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {		for (unsigned Idx = 0; Idx < Iters; Idx++) {
Value *const UpdateValue = Idx < 3 ? FirstDPP : NewV;		Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
CallInst *const DPP = B.CreateIntrinsic(		CallInst *const DPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Ty,		Intrinsic::amdgcn_update_dpp, Ty,
{Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),		{Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});		B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

NewV = B.CreateBinOp(Op, NewV, DPP);		ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}		}

LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);		NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
NewV = B.CreateBinOp(Op, SetInactive, NewV);

// Read the value from the last lane, which has accumulated the values of		// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value with which		// each active lane in the wavefront. This will be our new value which we
// we will provide to the atomic operation.		// will provide to the atomic operation.
if (TyBitWidth == 64) {		if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());		Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =		Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());		B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(		CallInst *const ReadLaneLo = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});		Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
CallInst *const ReadLaneHi = B.CreateIntrinsic(		CallInst *const ReadLaneHi = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});		Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
Value *const PartialInsert = B.CreateInsertElement(		Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));		UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =		Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));		B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);		NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {		} else if (TyBitWidth == 32) {
CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,		NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{}, {NewV, B.getInt32(63)});		{NewV, B.getInt32(63)});
NewV = ReadLane;
} else {		} else {
llvm_unreachable("Unhandled atomic bit width");		llvm_unreachable("Unhandled atomic bit width");
}		}

// Finally mark the readlanes in the WWM section.		// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);		NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {		} else {
		switch (Op) {
		default:
		llvm_unreachable("Unhandled atomic op");

		case AtomicRMWInst::Add:
		case AtomicRMWInst::Sub: {
// Get the total number of active lanes we have by using popcount.		// Get the total number of active lanes we have by using popcount.
Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);		Instruction *const Ctpop =
		B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot);
Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);		Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

// Calculate the new value we will be contributing to the atomic operation		// Calculate the new value we will be contributing to the atomic operation
// for the entire wavefront.		// for the entire wavefront.
NewV = B.CreateMul(V, CtpopCast);		NewV = B.CreateMul(V, CtpopCast);
LaneOffset = B.CreateMul(V, MbcntCast);		break;
		}

		case AtomicRMWInst::Max:
		case AtomicRMWInst::Min:
		case AtomicRMWInst::UMax:
		case AtomicRMWInst::UMin:
		// Max/min with a uniform value is idempotent: doing the atomic operation
		// multiple times has the same effect as doing it once.
		NewV = V;
		break;
		}
}		}

// We only want a single lane to enter our new control flow, and we do this		// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will		// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.		// have 0 active lanes below us, so that will be the only one to progress.
Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));		Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

// Store I's original basic block before we split the block.		// Store I's original basic block before we split the block.
▲ Show 20 Lines • Show All 49 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
} else {		} else {
llvm_unreachable("Unhandled atomic bit width");		llvm_unreachable("Unhandled atomic bit width");
}		}

// Now that we have the result of our single atomic operation, we need to		// Now that we have the result of our single atomic operation, we need to
// get our individual lane's slice into the result. We use the lane offset we		// get our individual lane's slice into the result. We use the lane offset we
// previously calculated combined with the atomic result value we got from the		// previously calculated combined with the atomic result value we got from the
// first lane, to get our lane's index into the atomic result.		// first lane, to get our lane's index into the atomic result.
Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);		Value *LaneOffset = nullptr;
		if (ValDivergent) {
		LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
		} else {
		switch (Op) {
		default:
		llvm_unreachable("Unhandled atomic op");
		case AtomicRMWInst::Add:
		case AtomicRMWInst::Sub:
		LaneOffset = B.CreateMul(V, MbcntCast);
		break;
		case AtomicRMWInst::Max:
		case AtomicRMWInst::Min:
		case AtomicRMWInst::UMax:
		case AtomicRMWInst::UMin:
		LaneOffset = B.CreateSelect(Cond, Identity, V);
		break;
		}
		}
		Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);

if (IsPixelShader) {		if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.		// Need a final PHI to reconverge to above the helper lane branch mask.
B.SetInsertPoint(PixelExitBB->getFirstNonPHI());		B.SetInsertPoint(PixelExitBB->getFirstNonPHI());

PHINode *const PHI = B.CreatePHI(Ty, 2);		PHINode *const PHI = B.CreatePHI(Ty, 2);
PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);		PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
PHI->addIncoming(Result, I.getParent());		PHI->addIncoming(Result, I.getParent());
Show All 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

	Show First 20 Lines • Show All 188 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {			define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%zext = zext i32 %lane to i64			%zext = zext i32 %lane to i64
	%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel			%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

				; GCN-LABEL: max_i32_varying:
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: max_i64_constant:
				; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
				; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: min_i32_varying:
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: min_i64_constant:
				; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
				; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: umax_i32_varying:
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: umax_i64_constant:
				; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
				; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: umin_i32_varying:
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: umin_i64_constant:
				; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
				; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Optimize atomic max/min
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 210133

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Optimize atomic max/min
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 210133

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

[AMDGPU] Optimize atomic max/min
ClosedPublic