Diff 168669

llvm/trunk/lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 63 Lines • ▼ Show 20 Lines
	extern char &AMDGPUMachineCFGStructurizerID;			extern char &AMDGPUMachineCFGStructurizerID;

	void initializeAMDGPUAlwaysInlinePass(PassRegistry&);			void initializeAMDGPUAlwaysInlinePass(PassRegistry&);

	Pass *createAMDGPUAnnotateKernelFeaturesPass();			Pass *createAMDGPUAnnotateKernelFeaturesPass();
	void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);			void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
	extern char &AMDGPUAnnotateKernelFeaturesID;			extern char &AMDGPUAnnotateKernelFeaturesID;

				FunctionPass *createAMDGPUAtomicOptimizerPass();
				void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
				extern char &AMDGPUAtomicOptimizerID;

	ModulePass *createAMDGPULowerIntrinsicsPass();			ModulePass *createAMDGPULowerIntrinsicsPass();
	void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);			void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
	extern char &AMDGPULowerIntrinsicsID;			extern char &AMDGPULowerIntrinsicsID;

	FunctionPass *createAMDGPULowerKernelArgumentsPass();			FunctionPass *createAMDGPULowerKernelArgumentsPass();
	void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);			void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
	extern char &AMDGPULowerKernelArgumentsID;			extern char &AMDGPULowerKernelArgumentsID;

	▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

				//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				/// \file
				/// This pass optimizes atomic operations by using a single lane of a wavefront
				/// to perform the atomic operation, thus reducing contention on that memory
				/// location.
				//
				//===----------------------------------------------------------------------===//

				#include "AMDGPU.h"
				#include "AMDGPUSubtarget.h"
				#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
				#include "llvm/CodeGen/TargetPassConfig.h"
				#include "llvm/IR/IRBuilder.h"
				#include "llvm/IR/InstVisitor.h"
				#include "llvm/Transforms/Utils/BasicBlockUtils.h"

				#define DEBUG_TYPE "amdgpu-atomic-optimizer"

				using namespace llvm;

				namespace {

				// DPP control encodings. optimizeAtomic() passes these as the second
				// argument of the llvm.amdgcn.mov.dpp calls it builds.
				// NOTE(review): the names suggest row shifts (ROW_SR<n>), a wavefront-wide
				// shift (WF_SR1) and row broadcasts (ROW_BCAST15/31); confirm the exact
				// encodings against the ISA documentation.
				enum DPP_CTRL {
				DPP_ROW_SR1 = 0x111,
				DPP_ROW_SR2 = 0x112,
				DPP_ROW_SR4 = 0x114,
				DPP_ROW_SR8 = 0x118,
				DPP_WF_SR1 = 0x138,
				DPP_ROW_BCAST15 = 0x142,
				DPP_ROW_BCAST31 = 0x143
				};

				// One atomic operation that the visit* callbacks selected for rewriting.
				// Entries are queued during the visit and consumed by runOnFunction(),
				// which forwards the fields to optimizeAtomic().
				struct ReplacementInfo {
				// The atomic instruction (atomicrmw or buffer-atomic intrinsic call).
				Instruction *I;
				// The binary operation corresponding to the atomic (Add or Sub).
				Instruction::BinaryOps Op;
				// Operand index of the value being combined into memory.
				unsigned ValIdx;
				// Whether divergence analysis reported the value operand as divergent.
				bool ValDivergent;
				};

				// Function pass that visits every instruction of a function, records the
				// atomic add/sub operations it is able to rewrite, and then rewrites each
				// of them so that a single lane performs one combined atomic.
				class AMDGPUAtomicOptimizer : public FunctionPass,
				public InstVisitor<AMDGPUAtomicOptimizer> {
				private:
				// Atomics collected by the visit* callbacks for the current function.
				SmallVector<ReplacementInfo, 8> ToReplace;
				// Per-function state, refreshed at the start of runOnFunction().
				const LegacyDivergenceAnalysis *DA;
				const DataLayout *DL;
				// May be null: the dominator tree is only used when already available.
				DominatorTree *DT;
				// Cached result of GCNSubtarget::hasDPP() for the current function.
				bool HasDPP;

				// Rewrites a single recorded atomic; see the definition for details.
				void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
				unsigned ValIdx, bool ValDivergent) const;

				// Adds the convergent attribute to the given call.
				void setConvergent(CallInst *const CI) const;

				public:
				static char ID;

				AMDGPUAtomicOptimizer() : FunctionPass(ID) {}

				// Returns true if at least one atomic was rewritten.
				bool runOnFunction(Function &F) override;

				// The pass keeps the dominator tree up to date (it is handed to the block
				// splitting utility) and needs divergence information plus the target
				// pass configuration to query the subtarget.
				void getAnalysisUsage(AnalysisUsage &AU) const override {
				AU.addPreserved<DominatorTreeWrapperPass>();
				AU.addRequired<LegacyDivergenceAnalysis>();
				AU.addRequired<TargetPassConfig>();
				}

				// InstVisitor callbacks: each decides whether the visited atomic can be
				// optimized and, if so, appends it to ToReplace.
				void visitAtomicRMWInst(AtomicRMWInst &I);
				void visitIntrinsicInst(IntrinsicInst &I);
				};

				} // namespace

				// Storage for the pass identifier; its address identifies the pass.
				char AMDGPUAtomicOptimizer::ID = 0;

				// Exported reference to the pass identifier, declared in AMDGPU.h.
				char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;

				/// Collect every optimizable atomic in \p F and rewrite each of them.
				///
				/// \returns true if at least one atomic operation was rewritten.
				bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
				  if (skipFunction(F))
				    return false;

				  // Refresh the per-function state consulted by the visitor callbacks.
				  DL = &F.getParent()->getDataLayout();
				  DA = &getAnalysis<LegacyDivergenceAnalysis>();

				  // The dominator tree is optional; it is only updated when present.
				  DT = nullptr;
				  if (DominatorTreeWrapperPass *const DomTreePass =
				          getAnalysisIfAvailable<DominatorTreeWrapperPass>())
				    DT = &DomTreePass->getDomTree();

				  const TargetMachine &Machine =
				      getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
				  HasDPP = Machine.getSubtarget<GCNSubtarget>(F).hasDPP();

				  // First gather the candidates, then rewrite them, so that the IR is not
				  // modified while it is being visited.
				  visit(F);

				  if (ToReplace.empty())
				    return false;

				  for (ReplacementInfo &Pending : ToReplace)
				    optimizeAtomic(*Pending.I, Pending.Op, Pending.ValIdx,
				                   Pending.ValDivergent);

				  ToReplace.clear();
				  return true;
				}

				/// Decide whether the atomicrmw \p I can be optimized and, if so, queue it
				/// in ToReplace.
				///
				/// Only add/sub on the global and local address spaces with a uniform
				/// pointer and a 32-bit or 64-bit type are accepted. Divergent values are
				/// additionally restricted to 32 bits on subtargets with DPP.
				void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
				  // Early exit for unhandled address space atomic instructions.
				  switch (I.getPointerAddressSpace()) {
				  default:
				    return;
				  case AMDGPUAS::GLOBAL_ADDRESS:
				  case AMDGPUAS::LOCAL_ADDRESS:
				    break;
				  }

				  Instruction::BinaryOps Op;

				  switch (I.getOperation()) {
				  default:
				    return;
				  case AtomicRMWInst::Add:
				    Op = Instruction::Add;
				    break;
				  case AtomicRMWInst::Sub:
				    Op = Instruction::Sub;
				    break;
				  }

				  const unsigned PtrIdx = 0;
				  const unsigned ValIdx = 1;

				  // optimizeAtomic() can only broadcast 32-bit and 64-bit results and hits
				  // llvm_unreachable for anything else, so leave other widths alone.
				  const unsigned TyBitWidth = DL->getTypeSizeInBits(I.getType());
				  if (TyBitWidth != 32 && TyBitWidth != 64) {
				    return;
				  }

				  // If the pointer operand is divergent, then each lane is doing an atomic
				  // operation on a different address, and we cannot optimize that.
				  if (DA->isDivergent(I.getOperand(PtrIdx))) {
				    return;
				  }

				  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

				  // If the value operand is divergent, each lane is contributing a different
				  // value to the atomic calculation. We can only optimize divergent values if
				  // we have DPP available on our subtarget, and the atomic operation is 32
				  // bits.
				  if (ValDivergent && (!HasDPP || TyBitWidth != 32)) {
				    return;
				  }

				  // If we get here, we can optimize the atomic using a single wavefront-wide
				  // atomic operation to do the calculation for the entire wavefront, so
				  // remember the instruction so we can come back to it.
				  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

				  ToReplace.push_back(Info);
				}

				/// Decide whether the buffer-atomic intrinsic call \p I can be optimized
				/// and, if so, queue it in ToReplace.
				///
				/// Only the add/sub buffer atomics are considered. Every operand other
				/// than the value (operand 0) must be uniform, and a divergent value is
				/// only accepted for 32-bit operations on subtargets with DPP.
				void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
				  Instruction::BinaryOps Op;

				  switch (I.getIntrinsicID()) {
				  default:
				    return;
				  case Intrinsic::amdgcn_buffer_atomic_add:
				  case Intrinsic::amdgcn_struct_buffer_atomic_add:
				  case Intrinsic::amdgcn_raw_buffer_atomic_add:
				    Op = Instruction::Add;
				    break;
				  case Intrinsic::amdgcn_buffer_atomic_sub:
				  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
				  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
				    Op = Instruction::Sub;
				    break;
				  }

				  const unsigned ValIdx = 0;

				  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));

				  // If the value operand is divergent, each lane is contributing a different
				  // value to the atomic calculation. We can only optimize divergent values if
				  // we have DPP available on our subtarget, and the atomic operation is 32
				  // bits.
				  if (ValDivergent &&
				      (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
				    return;
				  }

				  // If any of the other arguments to the intrinsic are divergent, we can't
				  // optimize the operation.
				  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
				    if (DA->isDivergent(I.getOperand(Idx))) {
				      return;
				    }
				  }

				  // If we get here, we can optimize the atomic using a single wavefront-wide
				  // atomic operation to do the calculation for the entire wavefront, so
				  // remember the instruction so we can come back to it.
				  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};

				  ToReplace.push_back(Info);
				}

				/// Rewrite the atomic \p I so that one lane of the wavefront performs a
				/// single atomic with the combined contribution of all active lanes.
				///
				/// The function builds, just before \p I:
				///  - the combined value for the whole wavefront and a per-lane offset,
				///  - a conditional block entered only by the first active lane, containing
				///    a clone of \p I that uses the combined value,
				///  - a broadcast of the atomic's result to all lanes, from which each
				///    lane's own result is reconstructed with its offset.
				/// \p I is then replaced by that result and erased.
				///
				/// \param I            the atomic instruction or intrinsic call to rewrite.
				/// \param Op           Instruction::Add or Instruction::Sub.
				/// \param ValIdx       operand index of the value operand of \p I.
				/// \param ValDivergent whether the value operand differs between lanes.
				void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
				                                           Instruction::BinaryOps Op,
				                                           unsigned ValIdx,
				                                           bool ValDivergent) const {
				  LLVMContext &Context = I.getContext();

				  // Start building just before the instruction.
				  IRBuilder<> B(&I);

				  Type *const Ty = I.getType();
				  const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
				  Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);

				  // The contributions of the individual lanes must always be summed: an
				  // atomic sub of the sum is equivalent to subtracting every lane's value
				  // in turn. Scanning with Sub would instead compute running differences,
				  // so the scan uses Add for both operations. Op itself is still used for
				  // the atomic and for reconstructing each lane's result.
				  const Instruction::BinaryOps ScanOp =
				      (Op == Instruction::Sub) ? Instruction::Add : Op;

				  // This is the value in the atomic operation we need to combine in order to
				  // reduce the number of atomic operations.
				  Value *const V = I.getOperand(ValIdx);

				  // We need to know how many lanes are active within the wavefront, and we do
				  // this by getting the exec register, which tells us all the lanes that are
				  // active.
				  MDNode *const RegName =
				      llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec"));
				  Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName);
				  CallInst *const Exec =
				      B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata});
				  setConvergent(Exec);

				  // We need to know how many lanes are active within the wavefront that are
				  // below us. If we counted each lane linearly starting from 0, a lane is
				  // below us only if its associated index was less than ours. We do this by
				  // using the mbcnt intrinsic.
				  Value *const BitCast = B.CreateBitCast(Exec, VecTy);
				  Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
				  Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
				  CallInst *const PartialMbcnt = B.CreateIntrinsic(
				      Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
				  CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
				                                            {ExtractHi, PartialMbcnt});

				  Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false);

				  Value *LaneOffset = nullptr;
				  Value *NewV = nullptr;

				  // If we have a divergent value in each lane, we need to combine the value
				  // using DPP.
				  if (ValDivergent) {
				    // First we need to set all inactive invocations to 0, so that they can
				    // correctly contribute to the final result.
				    CallInst *const SetInactive = B.CreateIntrinsic(
				        Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)});
				    setConvergent(SetInactive);
				    NewV = SetInactive;

				    const unsigned Iters = 6;
				    const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1,     DPP_ROW_SR2,
				                                     DPP_ROW_SR4,     DPP_ROW_SR8,
				                                     DPP_ROW_BCAST15, DPP_ROW_BCAST31};
				    const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc};

				    // This loop performs an inclusive scan across the wavefront, with all lanes
				    // active (by using the WWM intrinsic).
				    for (unsigned Idx = 0; Idx < Iters; Idx++) {
				      CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty,
				                                              {NewV, B.getInt32(DPPCtrl[Idx]),
				                                               B.getInt32(RowMask[Idx]),
				                                               B.getInt32(0xf), B.getFalse()});
				      setConvergent(DPP);
				      Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

				      NewV = B.CreateBinOp(ScanOp, NewV, WWM);
				      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
				    }

				    // NewV has returned the inclusive scan of V, but for the lane offset we
				    // require an exclusive scan. We do this by shifting the values from the
				    // entire wavefront right by 1, and by setting the bound_ctrl (last argument
				    // to the intrinsic below) to true, we can guarantee that 0 will be shifted
				    // into the 0'th invocation.
				    CallInst *const DPP =
				        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
				                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
				                           B.getInt32(0xf), B.getTrue()});
				    setConvergent(DPP);
				    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);

				    // Read the value from the last lane, which has accumulated the values of
				    // each active lane in the wavefront. This will be the new value that we
				    // provide to the atomic operation.
				    if (TyBitWidth == 64) {
				      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
				      Value *const ExtractHi =
				          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
				      CallInst *const ReadLaneLo = B.CreateIntrinsic(
				          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
				      setConvergent(ReadLaneLo);
				      CallInst *const ReadLaneHi = B.CreateIntrinsic(
				          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
				      setConvergent(ReadLaneHi);
				      Value *const PartialInsert = B.CreateInsertElement(
				          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
				      Value *const Insert =
				          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
				      NewV = B.CreateBitCast(Insert, Ty);
				    } else if (TyBitWidth == 32) {
				      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
				                                                   {}, {NewV, B.getInt32(63)});
				      setConvergent(ReadLane);
				      NewV = ReadLane;
				    } else {
				      llvm_unreachable("Unhandled atomic bit width");
				    }
				  } else {
				    // Get the total number of active lanes we have by using popcount.
				    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
				    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);

				    // Calculate the new value we will be contributing to the atomic operation
				    // for the entire wavefront.
				    NewV = B.CreateMul(V, CtpopCast);
				    LaneOffset = B.CreateMul(V, MbcntCast);
				  }

				  // We only want a single lane to enter our new control flow, and we do this
				  // by checking if there are any active lanes below us. Only one lane will
				  // have 0 active lanes below us, so that will be the only one to progress.
				  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));

				  // Store I's original basic block before we split the block.
				  BasicBlock *const EntryBB = I.getParent();

				  // We need to introduce some new control flow to force a single lane to be
				  // active. We do this by splitting I's basic block at I, and introducing the
				  // new block such that:
				  //   entry --> single_lane --> exit
				  //     |                         ^
				  //     +-------------------------+
				  Instruction *const SingleLaneTerminator =
				      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);

				  // Move the IR builder into single_lane next.
				  B.SetInsertPoint(SingleLaneTerminator);

				  // Clone the original atomic operation into single lane, replacing the
				  // original value with our newly created one.
				  Instruction *const NewI = I.clone();
				  B.Insert(NewI);
				  NewI->setOperand(ValIdx, NewV);

				  // Move the IR builder into exit next, and start inserting just before the
				  // original instruction.
				  B.SetInsertPoint(&I);

				  // Create a PHI node to get our new atomic result into the exit block.
				  PHINode *const PHI = B.CreatePHI(Ty, 2);
				  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
				  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());

				  // We need to broadcast the value held by the lowest active lane (the first
				  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
				  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
				  Value *BroadcastI = nullptr;

				  if (TyBitWidth == 64) {
				    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
				    Value *const ExtractHi =
				        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
				    CallInst *const ReadFirstLaneLo =
				        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
				    setConvergent(ReadFirstLaneLo);
				    CallInst *const ReadFirstLaneHi =
				        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
				    setConvergent(ReadFirstLaneHi);
				    Value *const PartialInsert = B.CreateInsertElement(
				        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
				    Value *const Insert =
				        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
				    BroadcastI = B.CreateBitCast(Insert, Ty);
				  } else if (TyBitWidth == 32) {
				    CallInst *const ReadFirstLane =
				        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
				    setConvergent(ReadFirstLane);
				    BroadcastI = ReadFirstLane;
				  } else {
				    llvm_unreachable("Unhandled atomic bit width");
				  }

				  // Now that we have the result of our single atomic operation, we need to
				  // get our individual lane's slice into the result. We use the lane offset we
				  // previously calculated combined with the atomic result value we got from the
				  // first lane, to get our lane's index into the atomic result.
				  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);

				  // Replace the original atomic instruction with the new one.
				  I.replaceAllUsesWith(Result);

				  // And delete the original.
				  I.eraseFromParent();
				}

				// Marks the call \p CI with the convergent function attribute. It is used
				// on the cross-lane intrinsic calls created by optimizeAtomic().
				void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
				CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
				}

				// Pass registration. The dependencies listed here are the two analyses
				// that getAnalysisUsage() declares as required.
				INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
				"AMDGPU atomic optimizations", false, false)
				INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
				INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
				INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
				"AMDGPU atomic optimizations", false, false)

				// Factory declared in AMDGPU.h: returns a newly allocated instance of the
				// atomic optimizer pass.
				FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
				return new AMDGPUAtomicOptimizer();
				}

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 132 Lines • ▼ Show 20 Lines	static cl::opt<bool> EnableLibCallSimplify(
cl::Hidden);		cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(		static cl::opt<bool> EnableLowerKernelArguments(
"amdgpu-ir-lower-kernel-arguments",		"amdgpu-ir-lower-kernel-arguments",
cl::desc("Lower kernel argument loads in IR pass"),		cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),		cl::init(true),
cl::Hidden);		cl::Hidden);

		// Enable atomic optimization
		static cl::opt<bool> EnableAtomicOptimizations(
		"amdgpu-atomic-optimizations",
		cl::desc("Enable atomic optimizations"),
		cl::init(false),
		cl::Hidden);

extern "C" void LLVMInitializeAMDGPUTarget() {		extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target		// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());		RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());		RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

PassRegistry *PR = PassRegistry::getPassRegistry();		PassRegistry *PR = PassRegistry::getPassRegistry();
initializeR600ClauseMergePassPass(*PR);		initializeR600ClauseMergePassPass(*PR);
initializeR600ControlFlowFinalizerPass(*PR);		initializeR600ControlFlowFinalizerPass(*PR);
Show All 9 Lines	extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIPeepholeSDWAPass(*PR);		initializeSIPeepholeSDWAPass(*PR);
initializeSIShrinkInstructionsPass(*PR);		initializeSIShrinkInstructionsPass(*PR);
initializeSIOptimizeExecMaskingPreRAPass(*PR);		initializeSIOptimizeExecMaskingPreRAPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);		initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);		initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);		initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);		initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);		initializeAMDGPUArgumentUsageInfoPass(*PR);
		initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);		initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);		initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);		initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);		initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);		initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);		initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);		initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);		initializeAMDGPUUnifyMetadataPass(*PR);
▲ Show 20 Lines • Show All 568 Lines • ▼ Show 20 Lines	ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
if (ST.enableSIScheduler())		if (ST.enableSIScheduler())
return createSIMachineScheduler(C);		return createSIMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);		return createGCNMaxOccupancyMachineScheduler(C);
}		}

bool GCNPassConfig::addPreISel() {		bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();		AMDGPUPassConfig::addPreISel();

		if (EnableAtomicOptimizations) {
		addPass(createAMDGPUAtomicOptimizerPass());
		}

// FIXME: We need to run a pass to propagate the attributes when calls are		// FIXME: We need to run a pass to propagate the attributes when calls are
// supported.		// supported.
addPass(createAMDGPUAnnotateKernelFeaturesPass());		addPass(createAMDGPUAnnotateKernelFeaturesPass());

// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit		// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.		// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);		addPass(&AMDGPUUnifyDivergentExitNodesID);
if (!LateCFGStructurize) {		if (!LateCFGStructurize) {
▲ Show 20 Lines • Show All 144 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

	Show All 31 Lines

	add_llvm_target(AMDGPUCodeGen			add_llvm_target(AMDGPUCodeGen
	AMDGPUAliasAnalysis.cpp			AMDGPUAliasAnalysis.cpp
	AMDGPUAlwaysInlinePass.cpp			AMDGPUAlwaysInlinePass.cpp
	AMDGPUAnnotateKernelFeatures.cpp			AMDGPUAnnotateKernelFeatures.cpp
	AMDGPUAnnotateUniformValues.cpp			AMDGPUAnnotateUniformValues.cpp
	AMDGPUArgumentUsageInfo.cpp			AMDGPUArgumentUsageInfo.cpp
	AMDGPUAsmPrinter.cpp			AMDGPUAsmPrinter.cpp
				AMDGPUAtomicOptimizer.cpp
	AMDGPUCallLowering.cpp			AMDGPUCallLowering.cpp
	AMDGPUCodeGenPrepare.cpp			AMDGPUCodeGenPrepare.cpp
	AMDGPUFrameLowering.cpp			AMDGPUFrameLowering.cpp
	AMDGPUHSAMetadataStreamer.cpp			AMDGPUHSAMetadataStreamer.cpp
	AMDGPUInstrInfo.cpp			AMDGPUInstrInfo.cpp
	AMDGPUInstructionSelector.cpp			AMDGPUInstructionSelector.cpp
	AMDGPUIntrinsicInfo.cpp			AMDGPUIntrinsicInfo.cpp
	AMDGPUISelDAGToDAG.cpp			AMDGPUISelDAGToDAG.cpp
	▲ Show 20 Lines • Show All 79 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GFX9,FUNC %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GFX9,FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefixes=R600,FUNC %s			; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false < %s \| FileCheck -check-prefixes=R600,FUNC %s

	; FUNC-LABEL: {{^}}atomic_add_local:			; FUNC-LABEL: {{^}}atomic_add_local:
	; SICIVI: s_mov_b32 m0			; SICIVI: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0
	; R600: LDS_ADD *			; R600: LDS_ADD *
	; GCN: ds_add_u32			; GCN: ds_add_u32
	define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {			define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
	%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst			%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
	Show All 39 Lines

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s			; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false < %s \| FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s

	; FUNC-LABEL: {{^}}atomic_sub_local:			; FUNC-LABEL: {{^}}atomic_sub_local:
	; SICIVI: s_mov_b32 m0			; SICIVI: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; R600: LDS_SUB *			; R600: LDS_SUB *
	; GCN: ds_sub_u32			; GCN: ds_sub_u32
	define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {			define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
	Show All 40 Lines

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

				declare i32 @llvm.amdgcn.workitem.id.x()
				declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
				declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1)

				; Show what the atomic optimization pass will do for raw buffers.

				; GCN-LABEL: add_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_add v[[value]]
				define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_add v[[value]]
				define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
				entry:
				%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_add v[[value]]
				define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_add v{{[0-9]+}}
				define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
				entry:
				%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_sub v{{[0-9]+}}
				; Test kernel: subtracts the constant 1, passing the value returned by
				; llvm.amdgcn.workitem.id.x() as the third operand of
				; llvm.amdgcn.buffer.atomic.sub. The call's result is stored to %out.
				; NOTE(review): the test name calls the third operand the "offset" - confirm
				; against the intrinsic's declaration, which is not visible here.
				define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

				declare i32 @llvm.amdgcn.workitem.id.x()

				; Show what the atomic optimization pass will do for global pointers.

				; GCN-LABEL: add_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
				; Test kernel: atomicrmw add of the constant 5 to the i32 at the addrspace(1)
				; pointer %inout with acq_rel ordering; the atomicrmw result is stored to %out.
				define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
				entry:
				%old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
				; Test kernel: atomicrmw add of the kernel argument %additive to the i32 at
				; the addrspace(1) pointer %inout (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
				entry:
				%old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_add v[[value]]
				; Test kernel: atomicrmw add whose value operand is the result of
				; llvm.amdgcn.workitem.id.x(), applied to the i32 at the addrspace(1) pointer
				; %inout (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
				; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				; Test kernel: 64-bit variant - atomicrmw add of the constant 5 to the i64 at
				; the addrspace(1) pointer %inout (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
				entry:
				%old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - atomicrmw add of the kernel argument %additive
				; to the i64 at the addrspace(1) pointer %inout (acq_rel); the result is
				; stored to %out.
				define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
				entry:
				%old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_varying:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - the value operand is the result of
				; llvm.amdgcn.workitem.id.x() zero-extended to i64, added atomically (acq_rel)
				; to the i64 at the addrspace(1) pointer %inout; the result is stored to %out.
				define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%zext = zext i32 %lane to i64
				%old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
				; Test kernel: atomicrmw sub of the constant 5 from the i32 at the
				; addrspace(1) pointer %inout (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
				entry:
				%old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: {{flat|buffer|global}}_atomic_sub v[[value]]
				; Test kernel: atomicrmw sub of the kernel argument %subitive from the i32 at
				; the addrspace(1) pointer %inout (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
				entry:
				%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_sub v[[value]]
				; Test kernel: atomicrmw sub whose value operand is the result of
				; llvm.amdgcn.workitem.id.x(), applied to the i32 at the addrspace(1) pointer
				; %inout (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
				; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				; Test kernel: 64-bit variant - atomicrmw sub of the constant 5 from the i64
				; at the addrspace(1) pointer %inout (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
				entry:
				%old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - atomicrmw sub of the kernel argument
				; %subitive from the i64 at the addrspace(1) pointer %inout (acq_rel); the
				; result is stored to %out.
				define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
				entry:
				%old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_varying:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - the value operand is the result of
				; llvm.amdgcn.workitem.id.x() zero-extended to i64, subtracted atomically
				; (acq_rel) from the i64 at the addrspace(1) pointer %inout; the result is
				; stored to %out.
				define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%zext = zext i32 %lane to i64
				%old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

				declare i32 @llvm.amdgcn.workitem.id.x()

				@local_var32 = addrspace(3) global i32 undef, align 4
				@local_var64 = addrspace(3) global i64 undef, align 8

				; Show what the atomic optimization pass will do for local pointers.

				; GCN-LABEL: add_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw add of the constant 5 to the addrspace(3) global
				; @local_var32 (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
				entry:
				%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw add of the kernel argument %additive to the
				; addrspace(3) global @local_var32 (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) {
				entry:
				%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw add whose value operand is the result of
				; llvm.amdgcn.workitem.id.x(), applied to the addrspace(3) global
				; @local_var32 (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
				; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				; Test kernel: 64-bit variant - atomicrmw add of the constant 5 to the
				; addrspace(3) global @local_var64 (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - atomicrmw add of the kernel argument %additive
				; to the addrspace(3) global @local_var64 (acq_rel); the result is stored to
				; %out.
				define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) {
				entry:
				%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i64_varying:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - the value operand is the result of
				; llvm.amdgcn.workitem.id.x() zero-extended to i64, added atomically (acq_rel)
				; to the addrspace(3) global @local_var64; the result is stored to %out.
				define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%zext = zext i32 %lane to i64
				%old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw sub of the constant 5 from the addrspace(3) global
				; @local_var32 (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
				entry:
				%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw sub of the kernel argument %subitive from the
				; addrspace(3) global @local_var32 (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) {
				entry:
				%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]]
				; Test kernel: atomicrmw sub whose value operand is the result of
				; llvm.amdgcn.workitem.id.x(), applied to the addrspace(3) global
				; @local_var32 (acq_rel); the atomicrmw result is stored to %out.
				define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
				; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
				; Test kernel: 64-bit variant - atomicrmw sub of the constant 5 from the
				; addrspace(3) global @local_var64 (acq_rel); the result is stored to %out.
				define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
				entry:
				%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - atomicrmw sub of the kernel argument
				; %subitive from the addrspace(3) global @local_var64 (acq_rel); the result
				; is stored to %out.
				define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
				entry:
				%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i64_varying:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
				; Test kernel: 64-bit variant - the value operand is the result of
				; llvm.amdgcn.workitem.id.x() zero-extended to i64, subtracted atomically
				; (acq_rel) from the addrspace(3) global @local_var64; the result is stored
				; to %out.
				define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%zext = zext i32 %lane to i64
				%old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
				store i64 %old, i64 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

				declare i32 @llvm.amdgcn.workitem.id.x()
				declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
				declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)

				; Show what the atomic optimization pass will do for raw buffers.

				; GCN-LABEL: add_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_add v[[value]]
				; Test kernel: calls llvm.amdgcn.raw.buffer.atomic.add with the constant 5 as
				; its first operand on the <4 x i32> argument %inout (the three trailing i32
				; operands are zero) and stores the call's result to %out.
				define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_add v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.raw.buffer.atomic.add is the
				; kernel argument %additive; the three trailing i32 operands are zero. The
				; call's result is stored to %out.
				define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
				entry:
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_add v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.raw.buffer.atomic.add is the
				; value returned by llvm.amdgcn.workitem.id.x(); the three trailing i32
				; operands are zero. The call's result is stored to %out.
				define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_add v{{[0-9]+}}
				; Test kernel: adds the constant 1, passing the value returned by
				; llvm.amdgcn.workitem.id.x() as the third operand of
				; llvm.amdgcn.raw.buffer.atomic.add (the "offset" of the test name,
				; presumably - confirm against the intrinsic definition). The call's result
				; is stored to %out.
				define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_sub v[[value]]
				; Test kernel: calls llvm.amdgcn.raw.buffer.atomic.sub with the constant 5 as
				; its first operand on the <4 x i32> argument %inout (the three trailing i32
				; operands are zero) and stores the call's result to %out.
				define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_sub v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.raw.buffer.atomic.sub is the
				; kernel argument %subitive; the three trailing i32 operands are zero. The
				; call's result is stored to %out.
				define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
				entry:
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_sub v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.raw.buffer.atomic.sub is the
				; value returned by llvm.amdgcn.workitem.id.x(); the three trailing i32
				; operands are zero. The call's result is stored to %out.
				define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: sub_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_sub v{{[0-9]+}}
				; Test kernel: subtracts the constant 1, passing the value returned by
				; llvm.amdgcn.workitem.id.x() as the third operand of
				; llvm.amdgcn.raw.buffer.atomic.sub (the "offset" of the test name,
				; presumably - confirm against the intrinsic definition). The call's result
				; is stored to %out.
				define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
				; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s

				declare i32 @llvm.amdgcn.workitem.id.x()
				declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
				declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)

				; Show what the atomic optimization pass will do for struct buffers.

				; GCN-LABEL: add_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_add v[[value]]
				; Test kernel: calls llvm.amdgcn.struct.buffer.atomic.add with the constant 5
				; as its first operand on the <4 x i32> argument %inout (the four trailing i32
				; operands are zero) and stores the call's result to %out.
				define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_add v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.struct.buffer.atomic.add is
				; the kernel argument %additive; the four trailing i32 operands are zero. The
				; call's result is stored to %out.
				define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
				entry:
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_add v[[value]]
				; Test kernel: the first operand of llvm.amdgcn.struct.buffer.atomic.add is
				; the value returned by llvm.amdgcn.workitem.id.x(); the four trailing i32
				; operands are zero. The call's result is stored to %out.
				define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_vindex:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_add v{{[0-9]+}}
				; Test kernel: adds the constant 1, passing the value returned by
				; llvm.amdgcn.workitem.id.x() as the third operand of
				; llvm.amdgcn.struct.buffer.atomic.add (the "vindex" of the test name,
				; presumably - confirm against the intrinsic definition). The call's result
				; is stored to %out.
				define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; GCN-LABEL: add_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_add v{{[0-9]+}}
				; Test kernel: adds the constant 1, passing the value returned by
				; llvm.amdgcn.workitem.id.x() as the fourth operand of
				; llvm.amdgcn.struct.buffer.atomic.add (the "offset" of the test name,
				; presumably - confirm against the intrinsic definition). The call's result
				; is stored to %out.
				define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; Struct buffer atomic sub of the constant 5 with all other integer operands
				; zero. The checks expect, in order, a copy of exec into an SGPR pair,
				; v_mbcnt_lo and v_mbcnt_hi over the two halves of that copy, a compare of
				; the mbcnt result against 0 into vcc, an s_bcnt1_i32_b64 population count
				; of the exec copy, a v_mul_u32_u24 of that count by 5, and finally the
				; product used as the data operand of a single buffer_atomic_sub.
				; GCN-LABEL: sub_i32_constant:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
				; GCN: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; Struct buffer atomic sub whose data operand is the kernel argument
				; %subitive, with all other integer operands zero. The checks expect the
				; same exec copy, mbcnt, compare and s_bcnt1_i32_b64 sequence as the
				; constant case, followed by an s_mul_i32 of an SGPR with the population
				; count, a v_mov_b32 of the product into a VGPR, and that VGPR used as the
				; data operand of buffer_atomic_sub.
				; GCN-LABEL: sub_i32_uniform:
				; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
				; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
				; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
				; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
				; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
				; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
				; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GCN: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
				entry:
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; Struct buffer atomic sub whose data operand is the result of
				; llvm.amdgcn.workitem.id.x instead of a constant or a kernel argument.
				; The GFX7LESS checks require that no mbcnt or bcnt instruction shows up
				; between the label and a plain buffer_atomic_sub. The GFX8MORE checks
				; expect a v_readlane_b32 of lane 63 into an SGPR, a v_mov_b32 of that SGPR
				; into a VGPR, and that VGPR used as the data operand of buffer_atomic_sub.
				; GCN-LABEL: sub_i32_varying_vdata:
				; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
				; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
				; GFX7LESS-NOT: s_bcnt1_i32_b64
				; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
				; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
				; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
				; GFX8MORE: buffer_atomic_sub v[[value]]
				define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; Struct buffer atomic sub of the constant 1 where the third intrinsic
				; operand (the vindex, going by the test name) is the result of
				; llvm.amdgcn.workitem.id.x. On every run the checks require that no mbcnt
				; or bcnt instruction appears between the label and a plain
				; buffer_atomic_sub, i.e. the lane-counting sequence must not be emitted.
				; GCN-LABEL: sub_i32_varying_vindex:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_sub v{{[0-9]+}}
				define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

				; Struct buffer atomic sub of the constant 1 where the fourth intrinsic
				; operand (the offset, going by the test name) is the result of
				; llvm.amdgcn.workitem.id.x. On every run the checks require that no mbcnt
				; or bcnt instruction appears between the label and a plain
				; buffer_atomic_sub, i.e. the lane-counting sequence must not be emitted.
				; GCN-LABEL: sub_i32_varying_offset:
				; GCN-NOT: v_mbcnt_lo_u32_b32
				; GCN-NOT: v_mbcnt_hi_u32_b32
				; GCN-NOT: s_bcnt1_i32_b64
				; GCN: buffer_atomic_sub v{{[0-9]+}}
				define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
				entry:
				%lane = call i32 @llvm.amdgcn.workitem.id.x()
				%old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
				store i32 %old, i32 addrspace(1)* %out
				ret void
				}

llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s			; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s			; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

	; GCN-LABEL: {{^}}atomic_add_i32_offset:			; GCN-LABEL: {{^}}atomic_add_i32_offset:
	; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}			; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
	; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}			; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:16{{$}}
	define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {			define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
	entry:			entry:
	%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4			%gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
	%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst			%val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
	▲ Show 20 Lines • Show All 1,215 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll

	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s			; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s			; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

	; GCN-LABEL: {{^}}atomic_add_i64_offset:			; GCN-LABEL: {{^}}atomic_add_i64_offset:
	; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}			; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}

	; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}			; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], off offset:32{{$}}
	define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {			define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
	entry:			entry:
	%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4			%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
	▲ Show 20 Lines • Show All 1,168 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll

	;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI			;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI
	;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI			;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI

	;CHECK-LABEL: {{^}}test1:			;CHECK-LABEL: {{^}}test1:
	;CHECK-NOT: s_waitcnt			;CHECK-NOT: s_waitcnt
	;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc			;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
	;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc			;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc			;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	▲ Show 20 Lines • Show All 119 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll

	;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI			;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI
	;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI			;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI

	;CHECK-LABEL: {{^}}test1:			;CHECK-LABEL: {{^}}test1:
	;CHECK-NOT: s_waitcnt			;CHECK-NOT: s_waitcnt
	;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc			;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
	;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc			;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc			;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	▲ Show 20 Lines • Show All 105 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll

	;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI			;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=SICI
	;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI			;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs \| FileCheck %s -check-prefix=CHECK -check-prefix=VI

	;CHECK-LABEL: {{^}}test1:			;CHECK-LABEL: {{^}}test1:
	;CHECK-NOT: s_waitcnt			;CHECK-NOT: s_waitcnt
	;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc			;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc			;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen glc
	;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc			;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
	;CHECK: s_waitcnt vmcnt(0)			;CHECK: s_waitcnt vmcnt(0)
	▲ Show 20 Lines • Show All 117 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s			; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s

	; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:			; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
	; EG: LDS_WRXCHG_RET *			; EG: LDS_WRXCHG_RET *

	; SICIVI-DAG: s_mov_b32 m0			; SICIVI-DAG: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],			; GCN-DAG: s_load_dword [[SPTR:s[0-9]+]],
	▲ Show 20 Lines • Show All 703 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s			; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s			; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s
	; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s			; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s

	; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:			; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
	; SICIVI: s_mov_b32 m0			; SICIVI: s_mov_b32 m0
	; GFX9-NOT: m0			; GFX9-NOT: m0

	; GCN: ds_wrxchg_rtn_b64			; GCN: ds_wrxchg_rtn_b64
	; GCN: s_endpgm			; GCN: s_endpgm
	define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {			define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
	▲ Show 20 Lines • Show All 604 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add an AMDGPU specific atomic optimizer.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 168669

llvm/trunk/lib/Target/AMDGPU/AMDGPU.h

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll

llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll

llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add an AMDGPU specific atomic optimizer.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 168669

llvm/trunk/lib/Target/AMDGPU/AMDGPU.h

llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll

llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll

llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll

llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll

llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll

[AMDGPU] Add an AMDGPU specific atomic optimizer.
ClosedPublic