Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
@@ -69,6 +69,10 @@
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+FunctionPass *createAMDGPUAtomicOptimizerPass();
+void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
+extern char &AMDGPUAtomicOptimizerID;
+
 ModulePass *createAMDGPULowerIntrinsicsPass();
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -0,0 +1,421 @@
+//===-- AMDGPUAtomicOptimizer.cpp -----------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass optimizes atomic operations by using a single lane of a wavefront
+/// to perform the atomic operation, thus reducing contention on that memory
+/// location.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "amdgpu-atomic-optimizer"
+
+using namespace llvm;
+
+namespace {
+
+enum DPP_CTRL {
+  DPP_ROW_SR1 = 0x111,
+  DPP_ROW_SR2 = 0x112,
+  DPP_ROW_SR4 = 0x114,
+  DPP_ROW_SR8 = 0x118,
+  DPP_WF_SR1 = 0x138,
+  DPP_ROW_BCAST15 = 0x142,
+  DPP_ROW_BCAST31 = 0x143
+};
+
+struct ReplacementInfo {
+  Instruction *I;
+  Instruction::BinaryOps Op;
+  unsigned ValIdx;
+  bool ValDivergent;
+};
+
+class AMDGPUAtomicOptimizer : public FunctionPass,
+                              public InstVisitor<AMDGPUAtomicOptimizer> {
+private:
+  SmallVector<ReplacementInfo, 8> ToReplace;
+  const LegacyDivergenceAnalysis *DA;
+  const DataLayout *DL;
+  DominatorTree *DT;
+  bool HasDPP;
+
+  void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
+                      unsigned ValIdx, bool ValDivergent) const;
+
+  void setConvergent(CallInst *const CI) const;
+
+public:
+  static char ID;
+
+  AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<LegacyDivergenceAnalysis>();
+    AU.addRequired<TargetPassConfig>();
+  }
+
+  void visitAtomicRMWInst(AtomicRMWInst &I);
+  void visitIntrinsicInst(IntrinsicInst &I);
+};
+
+} // namespace
+
+char AMDGPUAtomicOptimizer::ID = 0;
+
+char &llvm::AMDGPUAtomicOptimizerID = AMDGPUAtomicOptimizer::ID;
+
+bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
+  if (skipFunction(F)) {
+    return false;
+  }
+
+  DA = &getAnalysis<LegacyDivergenceAnalysis>();
+  DL = &F.getParent()->getDataLayout();
+  DominatorTreeWrapperPass *const DTW =
+      getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+  DT = DTW ? &DTW->getDomTree() : nullptr;
+  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const TargetMachine &TM = TPC.getTM<TargetMachine>();
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  HasDPP = ST.hasDPP();
+
+  visit(F);
+
+  const bool Changed = !ToReplace.empty();
+
+  for (ReplacementInfo &Info : ToReplace) {
+    optimizeAtomic(*Info.I, Info.Op, Info.ValIdx, Info.ValDivergent);
+  }
+
+  ToReplace.clear();
+
+  return Changed;
+}
+
+void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+  // Early exit for unhandled address space atomic instructions.
+  switch (I.getPointerAddressSpace()) {
+  default:
+    return;
+  case AMDGPUAS::GLOBAL_ADDRESS:
+  case AMDGPUAS::LOCAL_ADDRESS:
+    break;
+  }
+
+  Instruction::BinaryOps Op;
+
+  switch (I.getOperation()) {
+  default:
+    return;
+  case AtomicRMWInst::Add:
+    Op = Instruction::Add;
+    break;
+  case AtomicRMWInst::Sub:
+    Op = Instruction::Sub;
+    break;
+  }
+
+  const unsigned PtrIdx = 0;
+  const unsigned ValIdx = 1;
+
+  // If the pointer operand is divergent, then each lane is doing an atomic
+  // operation on a different address, and we cannot optimize that.
+  if (DA->isDivergent(I.getOperand(PtrIdx))) {
+    return;
+  }
+
+  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+  // If the value operand is divergent, each lane is contributing a different
+  // value to the atomic calculation. We can only optimize divergent values if
+  // we have DPP available on our subtarget, and the atomic operation is 32
+  // bits.
+  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+    return;
+  }
+
+  // If we get here, we can optimize the atomic using a single wavefront-wide
+  // atomic operation to do the calculation for the entire wavefront, so
+  // remember the instruction so we can come back to it.
+  const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent};
+
+  ToReplace.push_back(Info);
+}
+
+void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+  Instruction::BinaryOps Op;
+
+  switch (I.getIntrinsicID()) {
+  default:
+    return;
+  case Intrinsic::amdgcn_buffer_atomic_add:
+  case Intrinsic::amdgcn_struct_buffer_atomic_add:
+  case Intrinsic::amdgcn_raw_buffer_atomic_add:
+    Op = Instruction::Add;
+    break;
+  case Intrinsic::amdgcn_buffer_atomic_sub:
+  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+    Op = Instruction::Sub;
+    break;
+  }
+
+  const unsigned ValIdx = 0;
+
+  const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+
+  // If the value operand is divergent, each lane is contributing a different
+  // value to the atomic calculation. We can only optimize divergent values if
+  // we have DPP available on our subtarget, and the atomic operation is 32
+  // bits.
+  if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+    return;
+  }
+
+  // If any of the other arguments to the intrinsic are divergent, we can't
+  // optimize the operation.
+  for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
+    if (DA->isDivergent(I.getOperand(Idx))) {
+      return;
+    }
+  }
+
+  // If we get here, we can optimize the atomic using a single wavefront-wide
+  // atomic operation to do the calculation for the entire wavefront, so
+  // remember the instruction so we can come back to it.
+ const ReplacementInfo Info = {&I, Op, ValIdx, ValDivergent}; + + ToReplace.push_back(Info); +} + +void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, + Instruction::BinaryOps Op, + unsigned ValIdx, + bool ValDivergent) const { + LLVMContext &Context = I.getContext(); + + // Start building just before the instruction. + IRBuilder<> B(&I); + + Type *const Ty = I.getType(); + const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty); + Type *const VecTy = VectorType::get(B.getInt32Ty(), 2); + + // This is the value in the atomic operation we need to combine in order to + // reduce the number of atomic operations. + Value *const V = I.getOperand(ValIdx); + + // We need to know how many lanes are active within the wavefront, and we do + // this by getting the exec register, which tells us all the lanes that are + // active. + MDNode *const RegName = + llvm::MDNode::get(Context, llvm::MDString::get(Context, "exec")); + Value *const Metadata = llvm::MetadataAsValue::get(Context, RegName); + CallInst *const Exec = + B.CreateIntrinsic(Intrinsic::read_register, {B.getInt64Ty()}, {Metadata}); + setConvergent(Exec); + + // We need to know how many lanes are active within the wavefront that are + // below us. If we counted each lane linearly starting from 0, a lane is + // below us only if its associated index was less than ours. We do this by + // using the mbcnt intrinsic. + Value *const BitCast = B.CreateBitCast(Exec, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + CallInst *const PartialMbcnt = B.CreateIntrinsic( + Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); + CallInst *const Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, + {ExtractHi, PartialMbcnt}); + + Value *const MbcntCast = B.CreateIntCast(Mbcnt, Ty, false); + + Value *LaneOffset = nullptr; + Value *NewV = nullptr; + + // If we have a divergent value in each lane, we need to combine the value + // using DPP. + if (ValDivergent) { + // First we need to set all inactive invocations to 0, so that they can + // correctly contribute to the final result. + CallInst *const SetInactive = B.CreateIntrinsic( + Intrinsic::amdgcn_set_inactive, Ty, {V, B.getIntN(TyBitWidth, 0)}); + setConvergent(SetInactive); + NewV = SetInactive; + + const unsigned Iters = 6; + const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, + DPP_ROW_SR4, DPP_ROW_SR8, + DPP_ROW_BCAST15, DPP_ROW_BCAST31}; + const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; + + // This loop performs an inclusive scan across the wavefront, with all lanes + // active (by using the WWM intrinsic). + for (unsigned Idx = 0; Idx < Iters; Idx++) { + CallInst *const DPP = B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, Ty, + {NewV, B.getInt32(DPPCtrl[Idx]), + B.getInt32(RowMask[Idx]), + B.getInt32(0xf), B.getFalse()}); + setConvergent(DPP); + Value *const WWM = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP); + + NewV = B.CreateBinOp(Op, NewV, WWM); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV); + } + + // NewV has returned the inclusive scan of V, but for the lane offset we + // require an exclusive scan. We do this by shifting the values from the + // entire wavefront right by 1, and by setting the bound_ctrl (last argument + // to the intrinsic below) to true, we can guarantee that 0 will be shifted + // into the 0'th invocation. 
+    CallInst *const DPP =
+        B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp, {Ty},
+                          {NewV, B.getInt32(DPP_WF_SR1), B.getInt32(0xf),
+                           B.getInt32(0xf), B.getTrue()});
+    setConvergent(DPP);
+    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, DPP);
+
+    // Read the value from the last lane, which has accumulated the values of
+    // each active lane in the wavefront. This will be the new value we
+    // provide to the atomic operation.
+    if (TyBitWidth == 64) {
+      Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
+      Value *const ExtractHi =
+          B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+      CallInst *const ReadLaneLo = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+      setConvergent(ReadLaneLo);
+      CallInst *const ReadLaneHi = B.CreateIntrinsic(
+          Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+      setConvergent(ReadLaneHi);
+      Value *const PartialInsert = B.CreateInsertElement(
+          UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
+      Value *const Insert =
+          B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1));
+      NewV = B.CreateBitCast(Insert, Ty);
+    } else if (TyBitWidth == 32) {
+      CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
+                                                   {}, {NewV, B.getInt32(63)});
+      setConvergent(ReadLane);
+      NewV = ReadLane;
+    } else {
+      llvm_unreachable("Unhandled atomic bit width");
+    }
+  } else {
+    // Get the total number of active lanes we have by using popcount.
+    Instruction *const Ctpop = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Exec);
+    Value *const CtpopCast = B.CreateIntCast(Ctpop, Ty, false);
+
+    // Calculate the new value we will be contributing to the atomic operation
+    // for the entire wavefront.
+    NewV = B.CreateMul(V, CtpopCast);
+    LaneOffset = B.CreateMul(V, MbcntCast);
+  }
+
+  // We only want a single lane to enter our new control flow, and we do this
+  // by checking if there are any active lanes below us. Only one lane will
+  // have 0 active lanes below it, so that will be the only one to progress.
+  Value *const Cond = B.CreateICmpEQ(MbcntCast, B.getIntN(TyBitWidth, 0));
+
+  // Store I's original basic block before we split the block.
+  BasicBlock *const EntryBB = I.getParent();
+
+  // We need to introduce some new control flow to force a single lane to be
+  // active. We do this by splitting I's basic block at I, and introducing the
+  // new block such that:
+  // entry --> single_lane -\
+  //       \------------------> exit
+  Instruction *const SingleLaneTerminator =
+      SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+
+  // Move the IR builder into single_lane next.
+  B.SetInsertPoint(SingleLaneTerminator);
+
+  // Clone the original atomic operation into the single_lane block, replacing
+  // the original value with our newly created one.
+  Instruction *const NewI = I.clone();
+  B.Insert(NewI);
+  NewI->setOperand(ValIdx, NewV);
+
+  // Move the IR builder into exit next, and start inserting just before the
+  // original instruction.
+  B.SetInsertPoint(&I);
+
+  // Create a PHI node to get our new atomic result into the exit block.
+  PHINode *const PHI = B.CreatePHI(Ty, 2);
+  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
+
+  // We need to broadcast the value from the lowest active lane (the first
+  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+  Value *BroadcastI = nullptr;
+
+  if (TyBitWidth == 64) {
+    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+    Value *const ExtractHi =
+        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
+    CallInst *const ReadFirstLaneLo =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+    setConvergent(ReadFirstLaneLo);
+    CallInst *const ReadFirstLaneHi =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+    setConvergent(ReadFirstLaneHi);
+    Value *const PartialInsert = B.CreateInsertElement(
+        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+    Value *const Insert =
+        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+    BroadcastI = B.CreateBitCast(Insert, Ty);
+  } else if (TyBitWidth == 32) {
+    CallInst *const ReadFirstLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    setConvergent(ReadFirstLane);
+    BroadcastI = ReadFirstLane;
+  } else {
+    llvm_unreachable("Unhandled atomic bit width");
+  }
+
+  // Now that we have the result of our single atomic operation, we need to
+  // get our individual lane's portion of the result. We combine the lane
+  // offset we previously calculated with the atomic result value we got from
+  // the first lane to recover this lane's slice of the atomic result.
+  Value *const Result = B.CreateBinOp(Op, BroadcastI, LaneOffset);
+
+  // Replace the original atomic instruction with the new one.
+  I.replaceAllUsesWith(Result);
+
+  // And delete the original.
+  I.eraseFromParent();
+}
+
+void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
+  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+                      "AMDGPU atomic optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
+                    "AMDGPU atomic optimizations", false, false)
+
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
+  return new AMDGPUAtomicOptimizer();
+}
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -138,6 +138,13 @@
     cl::init(true), cl::Hidden);
 
+// Enable atomic optimization
+static cl::opt<bool> EnableAtomicOptimizations(
+  "amdgpu-atomic-optimizations",
+  cl::desc("Enable atomic optimizations"),
+  cl::init(false),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -163,6 +170,7 @@
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
+  initializeAMDGPUAtomicOptimizerPass(*PR);
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
   initializeAMDGPULowerIntrinsicsPass(*PR);
@@ -747,6 +755,10 @@
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
 
+  if (EnableAtomicOptimizations) {
+    addPass(createAMDGPUAtomicOptimizerPass());
+  }
+
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
addPass(createAMDGPUAnnotateKernelFeaturesPass()); Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -37,6 +37,7 @@ AMDGPUAnnotateUniformValues.cpp AMDGPUArgumentUsageInfo.cpp AMDGPUAsmPrinter.cpp + AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp AMDGPUFrameLowering.cpp Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll +++ llvm/trunk/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=R600,FUNC %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false < %s | FileCheck -check-prefixes=R600,FUNC %s ; FUNC-LABEL: {{^}}atomic_add_local: ; SICIVI: s_mov_b32 m0 Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll +++ llvm/trunk/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SICIVI,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s ; FUNC-LABEL: {{^}}atomic_sub_local: ; SICIVI: s_mov_b32 m0 Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ 
-0,0 +1,129 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1)
+declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1)
+
+; Show what the atomic optimization pass will do for buffers.
+
+; GCN-LABEL: add_i32_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GCN: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
+entry:
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying_vdata:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX7LESS-NOT: s_bcnt1_i32_b64
+; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying_offset:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_add v{{[0-9]+}}
+define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32
@llvm.amdgcn.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: buffer_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i1 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GCN: buffer_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) { +entry: + %old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i1 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_varying_vdata: +; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 +; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 +; GFX7LESS-NOT: s_bcnt1_i32_b64 +; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: buffer_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i1 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_varying_offset: +; GCN-NOT: v_mbcnt_lo_u32_b32 +; GCN-NOT: v_mbcnt_hi_u32_b32 +; GCN-NOT: s_bcnt1_i32_b64 +; GCN: buffer_atomic_sub v{{[0-9]+}} +define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i1 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -0,0 +1,189 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+; Show what the atomic optimization pass will do for global pointers.
+
+; GCN-LABEL: add_i32_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+entry:
+  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GCN: {{flat|buffer|global}}_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
+entry:
+  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX7LESS-NOT: s_bcnt1_i32_b64
+; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i64_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
+; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+entry:
+  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
+  store i64 %old, i64
addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i64_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} +define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { +entry: + %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i64_varying: +; GCN-NOT: v_mbcnt_lo_u32_b32 +; GCN-NOT: v_mbcnt_hi_u32_b32 +; GCN-NOT: s_bcnt1_i32_b64 +; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} +define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +entry: + %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { +entry: + %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_varying: +; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 +; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 +; GFX7LESS-NOT: s_bcnt1_i32_b64 +; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: buffer_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + 
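The constant and uniform tests above all encode the same contract: s_bcnt1_i32_b64 counts the active lanes in exec, a single atomic is issued with value * popcount(exec), and each lane rebuilds the result its own atomic would have returned from the old value plus value * mbcnt(exec). A host-side C++ model of that arithmetic (an illustrative sketch only, not part of the patch; it assumes a 64-wide wavefront, an arbitrary example exec mask, and GCC/Clang's __builtin_popcountll):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Exec = 0xffffffff0000ffffull; // example exec mask (assumed)
  const uint32_t V = 5;                        // uniform operand, as above
  uint32_t Mem = 100;                          // the contended location

  // Reference: one atomic add per active lane, serialized.
  uint32_t Expected[64], Ref = Mem;
  for (int Lane = 0; Lane < 64; ++Lane) {
    if (!(Exec >> Lane & 1))
      continue;
    Expected[Lane] = Ref; // atomicrmw returns the old value
    Ref += V;
  }

  // Optimized form: s_bcnt1_i32_b64 + v_mul_u32_u24 feed a single atomic.
  const uint32_t Old = Mem;
  Mem += V * __builtin_popcountll(Exec);

  // Per-lane fixup: Old + V * mbcnt(exec), the count of active lanes below.
  for (int Lane = 0; Lane < 64; ++Lane) {
    if (!(Exec >> Lane & 1))
      continue;
    const uint64_t Lower = Lane == 0 ? 0 : ~0ull >> (64 - Lane);
    const uint32_t Mbcnt = __builtin_popcountll(Exec & Lower);
    assert(Old + V * Mbcnt == Expected[Lane]);
  }
  assert(Mem == Ref);
  return 0;
}

The same identity is why only the lane with mbcnt == 0 needs to enter the single-lane block: every other lane can reconstruct its result without touching memory.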
+; GCN-LABEL: sub_i64_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
+; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+entry:
+  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i64_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
+define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
+entry:
+  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i64_varying:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
+define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %zext = zext i32 %lane to i64
+  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -0,0 +1,192 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+@local_var32 = addrspace(3) global i32 undef, align 4
+@local_var64 = addrspace(3) global i64 undef, align 8
+
+; Show what the atomic optimization pass will do for local pointers.
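Every test in this file opens with the same v_mbcnt_lo_u32_b32 / v_mbcnt_hi_u32_b32 pair, which computes the lane's rank among the active lanes of the wavefront. A host-side C++ model of the two instructions (a sketch of the assumed GCN semantics, not part of the patch; __builtin_popcount/__builtin_popcountll are the GCC/Clang builtins):

#include <cassert>
#include <cstdint>

// v_mbcnt_lo_u32_b32: popcount of the mask bits below this lane within the
// low 32 lanes, plus the accumulator; lanes 32-63 see all 32 mask bits.
static uint32_t MbcntLo(uint32_t Mask, uint32_t Acc, unsigned Lane) {
  uint32_t Below = Lane >= 32 ? 0xffffffffu : (1u << Lane) - 1;
  return __builtin_popcount(Mask & Below) + Acc;
}

// v_mbcnt_hi_u32_b32: the same for the high 32 lanes; lanes 0-31 see none.
static uint32_t MbcntHi(uint32_t Mask, uint32_t Acc, unsigned Lane) {
  uint32_t Below = Lane <= 32 ? 0 : (1u << (Lane - 32)) - 1;
  return __builtin_popcount(Mask & Below) + Acc;
}

int main() {
  const uint64_t Exec = 0xf0f0f0f012345678ull; // example exec mask (assumed)
  for (unsigned Lane = 0; Lane < 64; ++Lane) {
    uint32_t Lo = MbcntLo((uint32_t)Exec, 0, Lane);
    uint32_t Rank = MbcntHi((uint32_t)(Exec >> 32), Lo, Lane);
    // Chained as in the checks, the pair yields the number of active lanes
    // strictly below this one.
    uint64_t Lower = Lane == 0 ? 0 : ~0ull >> (64 - Lane);
    assert(Rank == (uint32_t)__builtin_popcountll(Exec & Lower));
  }
  return 0;
}

Rank 0 identifies the first active lane, which is why the v_cmp_eq_u32 against 0 gates entry into the single-lane block.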
+ +; GCN-LABEL: add_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { +entry: + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 5 acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i32_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive) { +entry: + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %additive acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i32_varying: +; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 +; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 +; GFX7LESS-NOT: s_bcnt1_i32_b64 +; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i64_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 +; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} +define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { +entry: + %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 5 acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i64_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, 
s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} +define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { +entry: + %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %additive acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i64_varying: +; GCN-NOT: v_mbcnt_lo_u32_b32 +; GCN-NOT: v_mbcnt_hi_u32_b32 +; GCN-NOT: s_bcnt1_i32_b64 +; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} +define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %zext = zext i32 %lane to i64 + %old = atomicrmw add i64 addrspace(3)* @local_var64, i64 %zext acq_rel + store i64 %old, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { +entry: + %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 5 acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive) { +entry: + %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %subitive acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_varying: +; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 +; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 +; GFX7LESS-NOT: s_bcnt1_i32_b64 +; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] +define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i64_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], 
s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5
+; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}}
+define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
+entry:
+  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 5 acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i64_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
+define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) {
+entry:
+  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %subitive acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i64_varying:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}
+define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %zext = zext i32 %lane to i64
+  %old = atomicrmw sub i64 addrspace(3)* @local_var64, i64 %zext acq_rel
+  store i64 %old, i64 addrspace(1)* %out
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -0,0 +1,129 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
+declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
+
+; Show what the atomic optimization pass will do for raw buffers.
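For divergent values the GFX8MORE checks below only pin down the end of the pipeline (the v_readlane_b32 from lane 63); the value it reads is produced by the six DPP steps in optimizeAtomic: row_shr:1/2/4/8 under row_mask 0xf, then row_bcast:15 under row_mask 0xa and row_bcast:31 under row_mask 0xc. A C++ simulation of that sequence (an illustrative sketch, not part of the patch; it assumes all 64 lanes are active and models a lane whose DPP source is out of range as adding 0):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t V[64], Ref[64], Sum = 0;
  for (int I = 0; I < 64; ++I) V[I] = I * 7 + 1; // arbitrary per-lane values
  for (int I = 0; I < 64; ++I) { Sum += V[I]; Ref[I] = Sum; }

  // Steps 1-4: row_shr:1/2/4/8. Each lane adds the value of the lane N below
  // it within its row of 16; lanes with no in-row source add nothing.
  for (int Shift : {1, 2, 4, 8}) {
    uint32_t Old[64];
    std::memcpy(Old, V, sizeof(V));
    for (int I = 0; I < 64; ++I)
      if (I % 16 >= Shift)
        V[I] += Old[I - Shift];
  }
  // Step 5: row_bcast:15 with row_mask 0xa. Rows 1 and 3 add the scan total
  // of the even row below them (lanes 15 and 47 respectively).
  for (int I = 16; I < 32; ++I) V[I] += V[15];
  for (int I = 48; I < 64; ++I) V[I] += V[47];
  // Step 6: row_bcast:31 with row_mask 0xc. The upper half adds the running
  // total of the lower half (lane 31).
  for (int I = 32; I < 64; ++I) V[I] += V[31];

  // The result is an inclusive scan across the wavefront, so lane 63 holds
  // the total: the value v_readlane_b32 ..., 63 extracts in the checks below.
  for (int I = 0; I < 64; ++I) assert(V[I] == Ref[I]);
  return 0;
}

The first four steps build an inclusive scan within each row of 16 lanes; the two broadcasts then stitch the rows together.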
+ +; GCN-LABEL: add_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: buffer_atomic_add v[[value]] +define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i32_uniform: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] +; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GCN: buffer_atomic_add v[[value]] +define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) { +entry: + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i32_varying_vdata: +; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 +; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 +; GFX7LESS-NOT: s_bcnt1_i32_b64 +; GFX7LESS: buffer_atomic_add v{{[0-9]+}} +; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] +; GFX8MORE: buffer_atomic_add v[[value]] +define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: add_i32_varying_offset: +; GCN-NOT: v_mbcnt_lo_u32_b32 +; GCN-NOT: v_mbcnt_hi_u32_b32 +; GCN-NOT: s_bcnt1_i32_b64 +; GCN: buffer_atomic_add v{{[0-9]+}} +define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0) + store i32 %old, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: sub_i32_constant: +; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 +; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 +; GCN: buffer_atomic_sub v[[value]] +define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { +entry: + %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0) + store i32 
%old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GCN: buffer_atomic_sub v[[value]]
+define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
+entry:
+  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_varying_vdata:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX7LESS-NOT: s_bcnt1_i32_b64
+; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_sub v[[value]]
+define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_varying_offset:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_sub v{{[0-9]+}}
+define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -0,0 +1,155 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
+declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)
+
+; Show what the atomic optimization pass will do for struct buffers.
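The divergent-value tests below rely on two consumers of that inclusive scan: wf_sr1 with bound_ctrl set shifts it right by one lane (forcing 0 into lane 0) to produce each lane's exclusive offset, and v_readlane_b32 of lane 63 supplies the wavefront total for the single atomic. A C++ sketch of this bookkeeping (illustrative only, not part of the patch; the scan is computed directly here rather than with DPP, and all 64 lanes are assumed active):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t V[64];
  for (int I = 0; I < 64; ++I) V[I] = 3 * I + 2; // divergent per-lane values

  // Reference: what 64 serialized atomic adds would return to each lane.
  uint32_t Mem = 1000, Expected[64];
  for (int I = 0; I < 64; ++I) { Expected[I] = Mem; Mem += V[I]; }

  // Inclusive scan (built with the DPP sequence in the pass).
  uint32_t Incl[64], Sum = 0;
  for (int I = 0; I < 64; ++I) { Sum += V[I]; Incl[I] = Sum; }

  // wf_sr1 with bound_ctrl=1 turns it into an exclusive scan: each lane takes
  // the value one lane below, and lane 0's out-of-range read yields 0.
  uint32_t Excl[64];
  Excl[0] = 0;
  for (int I = 1; I < 64; ++I) Excl[I] = Incl[I - 1];

  // v_readlane_b32 ..., 63 supplies the wavefront total for the one atomic.
  const uint32_t Old = 1000;
  const uint32_t NewMem = Old + Incl[63];
  for (int I = 0; I < 64; ++I)
    assert(Old + Excl[I] == Expected[I]); // per-lane fixup matches
  assert(NewMem == Mem);
  return 0;
}

The assertions mirror what the FileCheck lines can only spot-check: the single atomic plus per-lane exclusive offsets reproduces exactly the results of the serialized atomics.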
+
+; GCN-LABEL: add_i32_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GCN: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
+entry:
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying_vdata:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX7LESS-NOT: s_bcnt1_i32_b64
+; GFX7LESS: buffer_atomic_add v{{[0-9]+}}
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_add v[[value]]
+define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying_vindex:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_add v{{[0-9]+}}
+define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: add_i32_varying_offset:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_add v{{[0-9]+}}
+define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_constant:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5
+; GCN: buffer_atomic_sub v[[value]]
+define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_uniform:
+; GCN: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec
+; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0
+; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]]
+; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]]
+; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}}
+; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]]
+; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GCN: buffer_atomic_sub v[[value]]
+define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
+entry:
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_varying_vdata:
+; GFX7LESS-NOT: v_mbcnt_lo_u32_b32
+; GFX7LESS-NOT: v_mbcnt_hi_u32_b32
+; GFX7LESS-NOT: s_bcnt1_i32_b64
+; GFX7LESS: buffer_atomic_sub v{{[0-9]+}}
+; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63
+; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]]
+; GFX8MORE: buffer_atomic_sub v[[value]]
+define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_varying_vindex:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_sub v{{[0-9]+}}
+define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: sub_i32_varying_offset:
+; GCN-NOT: v_mbcnt_lo_u32_b32
+; GCN-NOT: v_mbcnt_hi_u32_b32
+; GCN-NOT: s_bcnt1_i32_b64
+; GCN: buffer_atomic_sub v{{[0-9]+}}
+define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
+entry:
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
+  store i32 %old, i32 addrspace(1)* %out
+  ret void
+}
Index: llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
 ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,CIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; CIVI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_waitcnt
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_waitcnt
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
+;RUN: llc < %s -march=amdgcn -mcpu=verde -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=false -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

 ;CHECK-LABEL: {{^}}test1:
 ;CHECK-NOT: s_waitcnt
Index: llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-atomics.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=EG,FUNC %s

 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
 ; EG: LDS_WRXCHG_RET *
Index: llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s
+; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI,SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI,SICIVI,GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX89 %s

 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
 ; SICIVI: s_mov_b32 m0