This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
3/3
AMDGPUAtomicOptimizer.cpp
-
AMDGPUSubtarget.h
-
SIDefines.h
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
-
atomic_optimizations_buffer.ll
-
atomic_optimizations_global_pointer.ll
-
atomic_optimizations_local_pointer.ll
-
atomic_optimizations_pixelshader.ll
-
atomic_optimizations_raw_buffer.ll
-
atomic_optimizations_struct_buffer.ll

Differential D65644

[AMDGPU] gfx10 atomic optimizer changes.
ClosedPublic

Authored by foad on Aug 2 2019, 3:33 AM.

Download Raw Diff

Details

Reviewers

arsenm
sheredom
critson
rampitec
nhaehnle

Commits

rL369745: [AMDGPU] gfx10 atomic optimizer changes.
rGeac23862a85f: [AMDGPU] gfx10 atomic optimizer changes.

Summary

Add support for gfx10, where all DPP operations are confined to work
within a single row of 16 lanes, and wave32.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

foad created this revision. · Aug 2 2019, 3:33 AM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 2 2019, 3:33 AM

Herald added subscribers: jfb, hiraditya, t-tye and 5 others. · View Herald Transcript

Harbormaster completed remote builds in B36014: Diff 213008. · Aug 2 2019, 3:35 AM

I'm happy to split this up if the reviewers would like. There are a few NFC changes I could apply first, and/or I could try to split the wave32 changes out from the gfx10 DPP changes.

arsenm added inline comments. · Aug 5 2019, 7:32 AM

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
289	I think it would end up being shorter/less line wrapping if you separately got the declaration for the update_dpp intrinsic and reused it in all of these places
293	I'm trying to avoid explicit getGeneration checks everywhere, and restricting them to all be in the Subtarget.

foad marked an inline comment as done. · Aug 5 2019, 7:55 AM

foad added inline comments.

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
293	You mean I should define and use some more specific properties like `hasDPPBroadcasts` and `hasDPPWavefrontShifts`?

Add new hasDPPBroadcasts and hasDPPWavefrontShifts.
Use CreateCall instead of CreateIntrinsic in new helper functions.

Harbormaster completed remote builds in B36204: Diff 213552. · Aug 6 2019, 2:31 AM

foad marked 2 inline comments as done. · Aug 6 2019, 2:32 AM

Ping.

LGTM

This revision is now accepted and ready to land. · Aug 18 2019, 8:21 AM

Closed by commit rGeac23862a85f: [AMDGPU] gfx10 atomic optimizer changes. (authored by foad). · Explain Why · Aug 23 2019, 3:08 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUAtomicOptimizer.cpp

200 lines

AMDGPUSubtarget.h

8 lines

SIDefines.h

1 line

test/

CodeGen/

AMDGPU/

atomic_optimizations_buffer.ll

90 lines

atomic_optimizations_global_pointer.ll

118 lines

atomic_optimizations_local_pointer.ll

318 lines

atomic_optimizations_pixelshader.ll

36 lines

atomic_optimizations_raw_buffer.ll

70 lines

atomic_optimizations_struct_buffer.ll

70 lines

Diff 216792

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Show All 9 Lines
/// This pass optimizes atomic operations by using a single lane of a wavefront		/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory		/// to perform the atomic operation, thus reducing contention on that memory
/// location.		/// location.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
		#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"		#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"		#include "llvm/IR/InstVisitor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"		#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "amdgpu-atomic-optimizer"		#define DEBUG_TYPE "amdgpu-atomic-optimizer"

using namespace llvm;		using namespace llvm;
		using namespace llvm::AMDGPU;

namespace {		namespace {

enum DPP_CTRL {
DPP_ROW_SR1 = 0x111,
DPP_ROW_SR2 = 0x112,
DPP_ROW_SR3 = 0x113,
DPP_ROW_SR4 = 0x114,
DPP_ROW_SR8 = 0x118,
DPP_WF_SR1 = 0x138,
DPP_ROW_BCAST15 = 0x142,
DPP_ROW_BCAST31 = 0x143
};

struct ReplacementInfo {		struct ReplacementInfo {
Instruction *I;		Instruction *I;
AtomicRMWInst::BinOp Op;		AtomicRMWInst::BinOp Op;
unsigned ValIdx;		unsigned ValIdx;
bool ValDivergent;		bool ValDivergent;
};		};

class AMDGPUAtomicOptimizer : public FunctionPass,		class AMDGPUAtomicOptimizer : public FunctionPass,
public InstVisitor<AMDGPUAtomicOptimizer> {		public InstVisitor<AMDGPUAtomicOptimizer> {
private:		private:
SmallVector<ReplacementInfo, 8> ToReplace;		SmallVector<ReplacementInfo, 8> ToReplace;
const LegacyDivergenceAnalysis *DA;		const LegacyDivergenceAnalysis *DA;
const DataLayout *DL;		const DataLayout *DL;
DominatorTree *DT;		DominatorTree *DT;
bool HasDPP;		const GCNSubtarget *ST;
bool IsPixelShader;		bool IsPixelShader;

		Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
		Value *const Identity) const;
		Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,		void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;		bool ValDivergent) const;

public:		public:
static char ID;		static char ID;

AMDGPUAtomicOptimizer() : FunctionPass(ID) {}		AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

Show All 22 Lines	bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {

DA = &getAnalysis<LegacyDivergenceAnalysis>();		DA = &getAnalysis<LegacyDivergenceAnalysis>();
DL = &F.getParent()->getDataLayout();		DL = &F.getParent()->getDataLayout();
DominatorTreeWrapperPass *const DTW =		DominatorTreeWrapperPass *const DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();		getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DT = DTW ? &DTW->getDomTree() : nullptr;		DT = DTW ? &DTW->getDomTree() : nullptr;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();		const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();		const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);		ST = &TM.getSubtarget<GCNSubtarget>(F);
HasDPP = ST.hasDPP();
IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;		IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;

visit(F);		visit(F);

const bool Changed = !ToReplace.empty();		const bool Changed = !ToReplace.empty();

for (ReplacementInfo &Info : ToReplace) {		for (ReplacementInfo &Info : ToReplace) {
optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);		optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
}		}

const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));		const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

// If the value operand is divergent, each lane is contributing a different		// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if		// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32		// we have DPP available on our subtarget, and the atomic operation is 32
// bits.		// bits.
if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {		if (ValDivergent &&
		(!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;		return;
}		}

// If we get here, we can optimize the atomic using a single wavefront-wide		// If we get here, we can optimize the atomic using a single wavefront-wide
// atomic operation to do the calculation for the entire wavefront, so		// atomic operation to do the calculation for the entire wavefront, so
// remember the instruction so we can come back to it.		// remember the instruction so we can come back to it.
const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};		const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;		const unsigned ValIdx = 0;

const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));		const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));

// If the value operand is divergent, each lane is contributing a different		// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if		// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32		// we have DPP available on our subtarget, and the atomic operation is 32
// bits.		// bits.
if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {		if (ValDivergent &&
		(!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;		return;
}		}

// If any of the other arguments to the intrinsic are divergent, we can't		// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.		// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {		for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
if (DA->isDivergentUse(&I.getOperandUse(Idx))) {		if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
return;		return;
Show All 40 Lines	static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
case AtomicRMWInst::UMin:		case AtomicRMWInst::UMin:
Pred = CmpInst::ICMP_ULT;		Pred = CmpInst::ICMP_ULT;
break;		break;
}		}
Value *Cond = B.CreateICmp(Pred, LHS, RHS);		Value *Cond = B.CreateICmp(Pred, LHS, RHS);
return B.CreateSelect(Cond, LHS, RHS);		return B.CreateSelect(Cond, LHS, RHS);
}		}

		// Use the builder to create an inclusive scan of V across the wavefront, with
		// all lanes active.
		Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
		Value *V, Value *const Identity) const {
		Type *const Ty = V->getType();
		Module *M = B.GetInsertBlock()->getModule();
		Function *UpdateDPP =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
		Function *PermLaneX16 =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
		arsenmUnsubmitted Done Reply Inline Actions I think it would end up being shorter/less line wrapping if you separately got the declaration for the update_dpp intrinsic and reused it in all of these places arsenm: I think it would end up being shorter/less line wrapping if you separately got the declaration…
		Function *ReadLane =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});

		for (unsigned Idx = 0; Idx < 4; Idx++) {
		arsenmUnsubmitted Done Reply Inline Actions I'm trying to avoid explicit getGeneration checks everywhere, and restricting them to all be in the Subtarget. arsenm: I'm trying to avoid explicit getGeneration checks everywhere, and restricting them to all be in…
		foadAuthorUnsubmitted Done Reply Inline Actions You mean I should define and use some more specific properties like `hasDPPBroadcasts` and `hasDPPWavefrontShifts`? foad: You mean I should define and use some more specific properties like `hasDPPBroadcasts` and…
		V = buildNonAtomicBinOp(
		B, Op, V,
		B.CreateCall(UpdateDPP,
		{Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
		B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
		}
		if (ST->hasDPPBroadcasts()) {
		// GFX9 has DPP row broadcast operations.
		V = buildNonAtomicBinOp(
		B, Op, V,
		B.CreateCall(UpdateDPP,
		{Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
		B.getInt32(0xf), B.getFalse()}));
		V = buildNonAtomicBinOp(
		B, Op, V,
		B.CreateCall(UpdateDPP,
		{Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
		B.getInt32(0xf), B.getFalse()}));
		} else {
		// On GFX10 all DPP operations are confined to a single row. To get cross-
		// row operations we have to use permlane or readlane.

		// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
		// 48..63).
		Value *const PermX =
		B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
		B.getFalse(), B.getFalse()});
		V = buildNonAtomicBinOp(
		B, Op, V,
		B.CreateCall(UpdateDPP,
		{Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
		B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
		if (!ST->isWave32()) {
		// Combine lane 31 into lanes 32..63.
		Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
		V = buildNonAtomicBinOp(
		B, Op, V,
		B.CreateCall(UpdateDPP,
		{Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
		B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
		}
		}
		return V;
		}

		// Use the builder to create a shift right of V across the wavefront, with all
		// lanes active, to turn an inclusive scan into an exclusive scan.
		Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
		Value *const Identity) const {
		Type *const Ty = V->getType();
		Module *M = B.GetInsertBlock()->getModule();
		Function *UpdateDPP =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
		Function *ReadLane =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
		Function *WriteLane =
		Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});

		if (ST->hasDPPWavefrontShifts()) {
		// GFX9 has DPP wavefront shift operations.
		V = B.CreateCall(UpdateDPP,
		{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
		B.getInt32(0xf), B.getFalse()});
		} else {
		// On GFX10 all DPP operations are confined to a single row. To get cross-
		// row operations we have to use permlane or readlane.
		Value *Old = V;
		V = B.CreateCall(UpdateDPP,
		{Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
		B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

		// Copy the old lane 15 to the new lane 16.
		V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
		B.getInt32(16), V});

		if (!ST->isWave32()) {
		// Copy the old lane 31 to the new lane 32.
		V = B.CreateCall(
		WriteLane,
		{B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});

		// Copy the old lane 47 to the new lane 48.
		V = B.CreateCall(
		WriteLane,
		{B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
		}
		}

		return V;
		}

static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,		static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {		unsigned BitWidth) {
switch (Op) {		switch (Op) {
default:		default:
llvm_unreachable("Unhandled atomic op");		llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:		case AtomicRMWInst::Add:
case AtomicRMWInst::Sub:		case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:		case AtomicRMWInst::Or:
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);		Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

// This is the value in the atomic operation we need to combine in order to		// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.		// reduce the number of atomic operations.
Value *const V = I.getOperand(ValIdx);		Value *const V = I.getOperand(ValIdx);

// We need to know how many lanes are active within the wavefront, and we do		// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.		// this by doing a ballot of active lanes.
		Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
CallInst *const Ballot = B.CreateIntrinsic(		CallInst *const Ballot = B.CreateIntrinsic(
Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},		Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
{B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});		{B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});

// We need to know how many lanes are active within the wavefront that are		// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is		// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by		// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.		// using the mbcnt intrinsic.
		Value *Mbcnt;
		if (ST->isWave32()) {
		Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
		{Ballot, B.getInt32(0)});
		} else {
Value *const BitCast = B.CreateBitCast(Ballot, VecTy);		Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));		Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));		Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
CallInst *const PartialMbcnt = B.CreateIntrinsic(		Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});		{ExtractLo, B.getInt32(0)});
Value *const Mbcnt =		Mbcnt =
B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},		B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
{ExtractHi, PartialMbcnt}),		}
Ty, false);		Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);

Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));		Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));

Value *ExclScan = nullptr;		Value *ExclScan = nullptr;
Value *NewV = nullptr;		Value *NewV = nullptr;

// If we have a divergent value in each lane, we need to combine the value		// If we have a divergent value in each lane, we need to combine the value
// using DPP.		// using DPP.
if (ValDivergent) {		if (ValDivergent) {
// First we need to set all inactive invocations to the identity value, so		// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.		// that they can correctly contribute to the final result.
CallInst *const SetInactive =		NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});

ExclScan =
B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
{Identity, SetInactive, B.getInt32(DPP_WF_SR1),
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});

const unsigned Iters = 6;
const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2,
DPP_ROW_SR4, DPP_ROW_SR8,
DPP_ROW_BCAST15, DPP_ROW_BCAST31};
const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf};

// This loop performs an exclusive scan across the wavefront, with all lanes
// active (by using the WWM intrinsic).
for (unsigned Idx = 0; Idx < Iters; Idx++) {
CallInst *const DPP = B.CreateIntrinsic(
Intrinsic::amdgcn_update_dpp, Ty,
{Identity, ExclScan, B.getInt32(DPPCtrl[Idx]),
B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});

ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
}

NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);		const AtomicRMWInst::BinOp ScanOp =
		Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
		NewV = buildScan(B, ScanOp, NewV, Identity);
		ExclScan = buildShiftRight(B, NewV, Identity);

// Read the value from the last lane, which has accumlated the values of		// Read the value from the last lane, which has accumlated the values of
// each active lane in the wavefront. This will be our new value which we		// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.		// will provide to the atomic operation.
		Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
if (TyBitWidth == 64) {		if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());		Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =		Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());		B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(		CallInst *const ReadLaneLo = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});		Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
CallInst *const ReadLaneHi = B.CreateIntrinsic(		CallInst *const ReadLaneHi = B.CreateIntrinsic(
Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});		Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
Value *const PartialInsert = B.CreateInsertElement(		Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));		UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =		Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));		B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
NewV = B.CreateBitCast(Insert, Ty);		NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {		} else if (TyBitWidth == 32) {
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},		NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, B.getInt32(63)});		{NewV, LastLaneIdx});
} else {		} else {
llvm_unreachable("Unhandled atomic bit width");		llvm_unreachable("Unhandled atomic bit width");
}		}

// Finally mark the readlanes in the WWM section.		// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);		NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
} else {		} else {
switch (Op) {		switch (Op) {
▲ Show 20 Lines • Show All 70 Lines • ▼ Show 20 Lines	if (NeedResult) {
// We need to broadcast the value who was the lowest active lane (the first		// We need to broadcast the value who was the lowest active lane (the first
// lane) to all other lanes in the wavefront. We use an intrinsic for this,		// lane) to all other lanes in the wavefront. We use an intrinsic for this,
// but have to handle 64-bit broadcasts with two calls to this intrinsic.		// but have to handle 64-bit broadcasts with two calls to this intrinsic.
Value *BroadcastI = nullptr;		Value *BroadcastI = nullptr;

if (TyBitWidth == 64) {		if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());		Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
Value *const ExtractHi =		Value *const ExtractHi =
B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());		B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
CallInst *const ReadFirstLaneLo =		CallInst *const ReadFirstLaneLo =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);		B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
CallInst *const ReadFirstLaneHi =		CallInst *const ReadFirstLaneHi =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);		B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
Value *const PartialInsert = B.CreateInsertElement(		Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));		UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
Value *const Insert =		Value *const Insert =
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));		B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 943 Lines • ▼ Show 20 Lines	public:
bool hasLDSFPAtomics() const {		bool hasLDSFPAtomics() const {
return GFX8Insts;		return GFX8Insts;
}		}

bool hasDPP() const {		bool hasDPP() const {
return HasDPP;		return HasDPP;
}		}

		bool hasDPPBroadcasts() const {
		return HasDPP && getGeneration() < GFX10;
		}

		bool hasDPPWavefrontShifts() const {
		return HasDPP && getGeneration() < GFX10;
		}

bool hasDPP8() const {		bool hasDPP8() const {
return HasDPP8;		return HasDPP8;
}		}

bool hasR128A16() const {		bool hasR128A16() const {
return HasR128A16;		return HasR128A16;
}		}

▲ Show 20 Lines • Show All 371 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIDefines.h

	Show First 20 Lines • Show All 438 Lines • ▼ Show 20 Lines
	};			};

	} // namespace SDWA			} // namespace SDWA

	namespace DPP {			namespace DPP {

	enum DppCtrl : unsigned {			enum DppCtrl : unsigned {
	QUAD_PERM_FIRST = 0,			QUAD_PERM_FIRST = 0,
				QUAD_PERM_ID = 0xE4, // identity permutation
	QUAD_PERM_LAST = 0xFF,			QUAD_PERM_LAST = 0xFF,
	DPP_UNUSED1 = 0x100,			DPP_UNUSED1 = 0x100,
	ROW_SHL0 = 0x100,			ROW_SHL0 = 0x100,
	ROW_SHL_FIRST = 0x101,			ROW_SHL_FIRST = 0x101,
	ROW_SHL_LAST = 0x10F,			ROW_SHL_LAST = 0x10F,
	DPP_UNUSED2 = 0x110,			DPP_UNUSED2 = 0x110,
	ROW_SHR0 = 0x110,			ROW_SHR0 = 0x110,
	ROW_SHR_FIRST = 0x111,			ROW_SHR_FIRST = 0x111,
	▲ Show 20 Lines • Show All 170 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()
	declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)			declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
	declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1)			declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1)

	; Show what the atomic optimization pass will do for raw buffers.			; Show what the atomic optimization pass will do for raw buffers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_uniform:			; GCN-LABEL: add_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {			define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying_vdata:			; GCN-LABEL: add_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_add v{{[0-9]+}}			; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
	; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc			; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf			; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_add v[[value]]			; GFX8MORE: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying_offset:			; GCN-LABEL: add_i32_varying_offset:
	; GCN-NOT: v_mbcnt_lo_u32_b32			; GCN-NOT: v_mbcnt_lo_u32_b32
	; GCN-NOT: v_mbcnt_hi_u32_b32			; GCN-NOT: v_mbcnt_hi_u32_b32
	; GCN-NOT: s_bcnt1_i32_b64			; GCN-NOT: s_bcnt1_i32_b64
	; GCN: buffer_atomic_add v{{[0-9]+}}			; GCN: buffer_atomic_add v{{[0-9]+}}
	define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_constant:			; GCN-LABEL: sub_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_uniform:			; GCN-LABEL: sub_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {			define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_varying_vdata:			; GCN-LABEL: sub_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}			; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
	; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe			; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc			; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf			; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
	; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_sub v[[value]]			; GFX8MORE: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	Show All 14 Lines

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()

	; Show what the atomic optimization pass will do for global pointers.			; Show what the atomic optimization pass will do for global pointers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: {{flat\|buffer\|global}}_atomic_add v[[value]]			; GCN: {{flat\|buffer\|global}}_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {			define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
	entry:			entry:
	%old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel			%old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_uniform:			; GCN-LABEL: add_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: {{flat\|buffer\|global}}_atomic_add v[[value]]			; GCN: {{flat\|buffer\|global}}_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {			define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
	entry:			entry:
	%old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel			%old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying:			; GCN-LABEL: add_i32_varying:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_add v{{[0-9]+}}			; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_add v[[value]]			; GFX8MORE: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {			define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel			%old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i64_constant:			; GCN-LABEL: add_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
	; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {			define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
	entry:			entry:
	%old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel			%old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i64_uniform:			; GCN-LABEL: add_i64_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {			define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
	entry:			entry:
	%old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel			%old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i64_varying:			; GCN-LABEL: add_i64_varying:
	; GCN-NOT: v_mbcnt_lo_u32_b32			; GCN-NOT: v_mbcnt_lo_u32_b32
	; GCN-NOT: v_mbcnt_hi_u32_b32			; GCN-NOT: v_mbcnt_hi_u32_b32
	; GCN-NOT: s_bcnt1_i32_b64			; GCN-NOT: s_bcnt1_i32_b64
	; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: {{flat\|buffer\|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {			define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%zext = zext i32 %lane to i64			%zext = zext i32 %lane to i64
	%old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel			%old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_constant:			; GCN-LABEL: sub_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: {{flat\|buffer\|global}}_atomic_sub v[[value]]			; GCN: {{flat\|buffer\|global}}_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {			define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
	entry:			entry:
	%old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel			%old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_uniform:			; GCN-LABEL: sub_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: {{flat\|buffer\|global}}_atomic_sub v[[value]]			; GCN: {{flat\|buffer\|global}}_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {			define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
	entry:			entry:
	%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel			%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_varying:			; GCN-LABEL: sub_i32_varying:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}			; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_sub v[[value]]			; GFX8MORE: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {			define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel			%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i64_constant:			; GCN-LABEL: sub_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
	; GCN: {{flat\|buffer\|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: {{flat\|buffer\|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {			define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
	entry:			entry:
	%old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel			%old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i64_uniform:			; GCN-LABEL: sub_i64_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: {{flat\|buffer\|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: {{flat\|buffer\|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {			define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
	entry:			entry:
	%old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel			%old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	Show All 13 Lines

llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX1064 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX1032 %s

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()

	@local_var32 = addrspace(3) global i32 undef, align 4			@local_var32 = addrspace(3) global i32 undef, align 4
	@local_var64 = addrspace(3) global i64 undef, align 8			@local_var64 = addrspace(3) global i64 undef, align 8

	; Show what the atomic optimization pass will do for local pointers.			; Show what the atomic optimization pass will do for local pointers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {			define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel			%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_uniform:			; GCN-LABEL: add_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {			define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
	entry:			entry:
	%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel			%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying:			; GCN-LABEL: add_i32_varying:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}			; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

				define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
				; GFX1032-LABEL: add_i32_varying_gfx1032:
				; GFX1032: v_mov_b32_e32 v2, v0
				; GFX1032: s_or_saveexec_b32 s2, -1
				; GFX1032: s_load_dwordx2 s[0:1], s[0:1], 0x24
				; GFX1032: v_mov_b32_e32 v1, 0
				; GFX1032: s_mov_b32 exec_lo, s2
				; GFX1032: v_cmp_ne_u32_e64 s2, 1, 0
				; GFX1032: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
				; GFX1032: s_not_b32 exec_lo, exec_lo
				; GFX1032: v_mov_b32_e32 v2, 0
				; GFX1032: s_not_b32 exec_lo, exec_lo
				; GFX1032: s_or_saveexec_b32 s4, -1
				; GFX1032: v_mov_b32_e32 v3, v1
				; GFX1032: v_mov_b32_e32 v4, v1
				; GFX1032: s_mov_b32 s2, -1
				; GFX1032: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
				; GFX1032: v_add_nc_u32_e32 v2, v2, v3
				; GFX1032: v_mov_b32_e32 v3, v1
				; GFX1032: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
				; GFX1032: v_add_nc_u32_e32 v2, v2, v3
				; GFX1032: v_mov_b32_e32 v3, v1
				; GFX1032: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
				; GFX1032: v_add_nc_u32_e32 v2, v2, v3
				; GFX1032: v_mov_b32_e32 v3, v1
				; GFX1032: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
				; GFX1032: v_add_nc_u32_e32 v2, v2, v3
				; GFX1032: v_mov_b32_e32 v3, v2
				; GFX1032: v_permlanex16_b32 v3, v3, -1, -1
				; GFX1032: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
				; GFX1032: v_add_nc_u32_e32 v2, v2, v4
				; GFX1032: v_readlane_b32 s3, v2, 31
				; GFX1032: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
				; GFX1032: v_readlane_b32 s5, v2, 15
				; GFX1032: v_writelane_b32 v1, s5, 16
				; GFX1032: s_mov_b32 exec_lo, s4
				; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v0
				; GFX1032: s_and_saveexec_b32 s4, vcc_lo
				; GFX1032: s_cbranch_execz BB3_2
				; GFX1032: BB3_1:
				; GFX1032: v_mov_b32_e32 v0, local_var32@abs32@lo
				; GFX1032: v_mov_b32_e32 v5, s3
				; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0)
				; GFX1032: s_waitcnt_vscnt null, 0x0
				; GFX1032: ds_add_rtn_u32 v0, v0, v5
				; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0)
				; GFX1032: buffer_gl0_inv
				; GFX1032: buffer_gl1_inv
				; GFX1032: BB3_2:
				; GFX1032: v_nop
				; GFX1032: s_or_b32 exec_lo, exec_lo, s4
				; GFX1032: v_readfirstlane_b32 s3, v0
				; GFX1032: v_mov_b32_e32 v0, v1
				; GFX1032: v_add_nc_u32_e32 v0, s3, v0
				; GFX1032: s_mov_b32 s3, 0x31016000
				; GFX1032: s_nop 1
				; GFX1032: s_waitcnt lgkmcnt(0)
				; GFX1032: buffer_store_dword v0, off, s[0:3], 0
				; GFX1032: s_endpgm
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
				; GFX1064-LABEL: add_i32_varying_gfx1064:
				; GFX1064: v_mov_b32_e32 v2, v0
				; GFX1064: s_or_saveexec_b64 s[2:3], -1
				; GFX1064: s_load_dwordx2 s[0:1], s[0:1], 0x24
				; GFX1064: v_mov_b32_e32 v1, 0
				; GFX1064: s_mov_b64 exec, s[2:3]
				; GFX1064: v_cmp_ne_u32_e64 s[2:3], 1, 0
				; GFX1064: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
				; GFX1064: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
				; GFX1064: s_not_b64 exec, exec
				; GFX1064: v_mov_b32_e32 v2, 0
				; GFX1064: s_not_b64 exec, exec
				; GFX1064: s_or_saveexec_b64 s[4:5], -1
				; GFX1064: v_mov_b32_e32 v3, v1
				; GFX1064: v_mov_b32_e32 v4, v1
				; GFX1064: s_mov_b32 s2, -1
				; GFX1064: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v3
				; GFX1064: v_mov_b32_e32 v3, v1
				; GFX1064: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v3
				; GFX1064: v_mov_b32_e32 v3, v1
				; GFX1064: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v3
				; GFX1064: v_mov_b32_e32 v3, v1
				; GFX1064: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v3
				; GFX1064: v_mov_b32_e32 v3, v2
				; GFX1064: v_permlanex16_b32 v3, v3, -1, -1
				; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v4
				; GFX1064: v_mov_b32_e32 v4, v1
				; GFX1064: v_readlane_b32 s3, v2, 31
				; GFX1064: v_mov_b32_e32 v3, s3
				; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
				; GFX1064: v_add_nc_u32_e32 v2, v2, v4
				; GFX1064: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
				; GFX1064: v_readlane_b32 s3, v2, 15
				; GFX1064: v_readlane_b32 s6, v2, 31
				; GFX1064: v_writelane_b32 v1, s3, 16
				; GFX1064: v_readlane_b32 s3, v2, 63
				; GFX1064: v_writelane_b32 v1, s6, 32
				; GFX1064: v_readlane_b32 s6, v2, 47
				; GFX1064: v_writelane_b32 v1, s6, 48
				; GFX1064: s_mov_b64 exec, s[4:5]
				; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v0
				; GFX1064: s_and_saveexec_b64 s[4:5], vcc
				; GFX1064: s_cbranch_execz BB4_2
				; GFX1064: BB4_1:
				; GFX1064: v_mov_b32_e32 v0, local_var32@abs32@lo
				; GFX1064: v_mov_b32_e32 v5, s3
				; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0)
				; GFX1064: s_waitcnt_vscnt null, 0x0
				; GFX1064: ds_add_rtn_u32 v0, v0, v5
				; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0)
				; GFX1064: buffer_gl0_inv
				; GFX1064: buffer_gl1_inv
				; GFX1064: BB4_2:
				; GFX1064: v_nop
				; GFX1064: s_or_b64 exec, exec, s[4:5]
				; GFX1064: v_readfirstlane_b32 s3, v0
				; GFX1064: v_mov_b32_e32 v0, v1
				; GFX1064: v_add_nc_u32_e32 v0, s3, v0
				; GFX1064: s_mov_b32 s3, 0x31016000
				; GFX1064: s_nop 1
				; GFX1064: s_waitcnt lgkmcnt(0)
				; GFX1064: buffer_store_dword v0, off, s[0:3], 0
				; GFX1064: s_endpgm
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

	; GCN-LABEL: add_i64_constant:			; GCN-LABEL: add_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
	; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i64_uniform:			; GCN-LABEL: add_i64_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {			define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
	entry:			entry:
	%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel			%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i64_varying:			; GCN-LABEL: add_i64_varying:
	; GCN-NOT: v_mbcnt_lo_u32_b32			; GCN-NOT: v_mbcnt_lo_u32_b32
	; GCN-NOT: v_mbcnt_hi_u32_b32			; GCN-NOT: v_mbcnt_hi_u32_b32
	; GCN-NOT: s_bcnt1_i32_b64			; GCN-NOT: s_bcnt1_i32_b64
	; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {			define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%zext = zext i32 %lane to i64			%zext = zext i32 %lane to i64
	%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel			%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_constant:			; GCN-LABEL: sub_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {			define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel			%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_uniform:			; GCN-LABEL: sub_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {			define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
	entry:			entry:
	%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel			%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_varying:			; GCN-LABEL: sub_i32_varying:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}			; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i64_constant:			; GCN-LABEL: sub_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
	; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i64_uniform:			; GCN-LABEL: sub_i64_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {			define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
	entry:			entry:
	%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel			%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i64_varying:			; GCN-LABEL: sub_i64_varying:
	; GCN-NOT: v_mbcnt_lo_u32_b32			; GCN-NOT: v_mbcnt_lo_u32_b32
	; GCN-NOT: v_mbcnt_hi_u32_b32			; GCN-NOT: v_mbcnt_hi_u32_b32
	; GCN-NOT: s_bcnt1_i32_b64			; GCN-NOT: s_bcnt1_i32_b64
	; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}			; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
	define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {			define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%zext = zext i32 %lane to i64			%zext = zext i32 %lane to i64
	%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel			%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: and_i32_varying:			; GCN-LABEL: and_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw and i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: or_i32_varying:			; GCN-LABEL: or_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw or i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: xor_i32_varying:			; GCN-LABEL: xor_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw xor i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: max_i32_varying:			; GCN-LABEL: max_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw max i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: max_i64_constant:			; GCN-LABEL: max_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
	; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw max i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: min_i32_varying:			; GCN-LABEL: min_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw min i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: min_i64_constant:			; GCN-LABEL: min_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
	; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw min i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: umax_i32_varying:			; GCN-LABEL: umax_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw umax i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: umax_i64_constant:			; GCN-LABEL: umax_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
	; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw umax i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: umin_i32_varying:			; GCN-LABEL: umin_i32_varying:
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]			; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
	define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {			define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel			%old = atomicrmw umin i32 addrspace(3)* @local_var32, i32 %lane acq_rel
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: umin_i64_constant:			; GCN-LABEL: umin_i64_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0
	; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}			; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
	define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {			define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
	entry:			entry:
	%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel			%old = atomicrmw umin i64 addrspace(3)* @local_var64, i64 5 acq_rel
	store i64 %old, i64 addrspace(1)* %out			store i64 %old, i64 addrspace(1)* %out
	ret void			ret void
	}			}

llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll

	; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
	; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
				; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

	declare i1 @llvm.amdgcn.wqm.vote(i1)			declare i1 @llvm.amdgcn.wqm.vote(i1)
	declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)			declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
	declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)			declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)

	; Show what the atomic optimization pass will do for raw buffers.			; Show what the atomic optimization pass will do for raw buffers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN-LABEL: BB0_1:			; GCN-LABEL: BB0_1:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]			; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
	define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {			define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
	entry:			entry:
	%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)			%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
	%cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)			%cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
	%cond = and i1 %cond1, %cond2			%cond = and i1 %cond1, %cond2
	br i1 %cond, label %if, label %else			br i1 %cond, label %if, label %else
	if:			if:
	%bitcast = bitcast i32 %old to float			%bitcast = bitcast i32 %old to float
	call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)			call void @llvm.amdgcn.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i1 0, i1 0)
	ret void			ret void
	else:			else:
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying:			; GCN-LABEL: add_i32_varying:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GFX8MORE32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GFX8MORE64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GFX8MORE: v_add_u32_dpp			; GFX8MORE64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_add v[[value]]			; GFX8MORE: buffer_atomic_add v[[value]]
	; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]			; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]]
	define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {			define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
	entry:			entry:
	%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)			%cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
	%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i1 0)			%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i1 0)
	%cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)			%cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
	Show All 9 Lines

llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()
	declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)			declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
	declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)			declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)

	; Show what the atomic optimization pass will do for raw buffers.			; Show what the atomic optimization pass will do for raw buffers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_uniform:			; GCN-LABEL: add_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {			define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying_vdata:			; GCN-LABEL: add_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_add v{{[0-9]+}}			; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_add v[[value]]			; GFX8MORE: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying_offset:			; GCN-LABEL: add_i32_varying_offset:
	; GCN-NOT: v_mbcnt_lo_u32_b32			; GCN-NOT: v_mbcnt_lo_u32_b32
	; GCN-NOT: v_mbcnt_hi_u32_b32			; GCN-NOT: v_mbcnt_hi_u32_b32
	; GCN-NOT: s_bcnt1_i32_b64			; GCN-NOT: s_bcnt1_i32_b64
	; GCN: buffer_atomic_add v{{[0-9]+}}			; GCN: buffer_atomic_add v{{[0-9]+}}
	define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_constant:			; GCN-LABEL: sub_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_uniform:			; GCN-LABEL: sub_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {			define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_varying_vdata:			; GCN-LABEL: sub_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}			; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_sub v[[value]]			; GFX8MORE: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	Show All 14 Lines

llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
	; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s			; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s

	declare i32 @llvm.amdgcn.workitem.id.x()			declare i32 @llvm.amdgcn.workitem.id.x()
	declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)			declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
	declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)			declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)

	; Show what the atomic optimization pass will do for struct buffers.			; Show what the atomic optimization pass will do for struct buffers.

	; GCN-LABEL: add_i32_constant:			; GCN-LABEL: add_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_uniform:			; GCN-LABEL: add_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_add v[[value]]			; GCN: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {			define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: add_i32_varying_vdata:			; GCN-LABEL: add_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_add v{{[0-9]+}}			; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_add_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_add v[[value]]			; GFX8MORE: buffer_atomic_add v[[value]]
	define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	Show All 21 Lines
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_constant:			; GCN-LABEL: sub_i32_constant:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5			; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_uniform:			; GCN-LABEL: sub_i32_uniform:
	; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0			; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0
	; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0			; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0
	; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]			; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0
	; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]			; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]]
	; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}			; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]]
				; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]]
				; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
	; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]			; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
	; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GCN: buffer_atomic_sub v[[value]]			; GCN: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {			define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
	entry:			entry:
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: sub_i32_varying_vdata:			; GCN-LABEL: sub_i32_varying_vdata:
	; GFX7LESS-NOT: v_mbcnt_lo_u32_b32			; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
	; GFX7LESS-NOT: v_mbcnt_hi_u32_b32			; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
	; GFX7LESS-NOT: s_bcnt1_i32_b64			; GFX7LESS-NOT: s_bcnt1_i32_b64
	; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}			; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_sub{{(rev)?}}_u32_dpp			; DPPCOMB: v_add_u32_dpp
	; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63			; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31
				; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
	; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]			; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
	; GFX8MORE: buffer_atomic_sub v[[value]]			; GFX8MORE: buffer_atomic_sub v[[value]]
	define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {			define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
	entry:			entry:
	%lane = call i32 @llvm.amdgcn.workitem.id.x()			%lane = call i32 @llvm.amdgcn.workitem.id.x()
	%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)			%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
	store i32 %old, i32 addrspace(1)* %out			store i32 %old, i32 addrspace(1)* %out
	ret void			ret void
	Show All 27 Lines