diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" +#include "SIDefines.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" @@ -24,20 +25,10 @@ #define DEBUG_TYPE "amdgpu-atomic-optimizer" using namespace llvm; +using namespace llvm::AMDGPU; namespace { -enum DPP_CTRL { - DPP_ROW_SR1 = 0x111, - DPP_ROW_SR2 = 0x112, - DPP_ROW_SR3 = 0x113, - DPP_ROW_SR4 = 0x114, - DPP_ROW_SR8 = 0x118, - DPP_WF_SR1 = 0x138, - DPP_ROW_BCAST15 = 0x142, - DPP_ROW_BCAST31 = 0x143 -}; - struct ReplacementInfo { Instruction *I; AtomicRMWInst::BinOp Op; @@ -52,9 +43,12 @@ const LegacyDivergenceAnalysis *DA; const DataLayout *DL; DominatorTree *DT; - bool HasDPP; + const GCNSubtarget *ST; bool IsPixelShader; + Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, + Value *const Identity) const; + Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,8 +87,7 @@ DT = DTW ? &DTW->getDomTree() : nullptr; const TargetPassConfig &TPC = getAnalysis(); const TargetMachine &TM = TPC.getTM(); - const GCNSubtarget &ST = TM.getSubtarget(F); - HasDPP = ST.hasDPP(); + ST = &TM.getSubtarget(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; visit(F); @@ -152,7 +145,8 @@ // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } @@ -225,7 +219,8 @@ // value to the atomic calculation. We can only optimize divergent values if // we have DPP available on our subtarget, and the atomic operation is 32 // bits. - if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) { + if (ValDivergent && + (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) { return; } @@ -282,6 +277,111 @@ return B.CreateSelect(Cond, LHS, RHS); } +// Use the builder to create an inclusive scan of V across the wavefront, with +// all lanes active. +Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *V, Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *PermLaneX16 = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {}); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + + for (unsigned Idx = 0; Idx < 4; Idx++) { + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()})); + } + if (ST->hasDPPBroadcasts()) { + // GFX9 has DPP row broadcast operations. + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa), + B.getInt32(0xf), B.getFalse()})); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc), + B.getInt32(0xf), B.getFalse()})); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + + // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes + // 48..63). + Value *const PermX = + B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1), + B.getFalse(), B.getFalse()}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xa), B.getInt32(0xf), B.getFalse()})); + if (!ST->isWave32()) { + // Combine lane 31 into lanes 32..63. + Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)}); + V = buildNonAtomicBinOp( + B, Op, V, + B.CreateCall(UpdateDPP, + {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID), + B.getInt32(0xc), B.getInt32(0xf), B.getFalse()})); + } + } + return V; +} + +// Use the builder to create a shift right of V across the wavefront, with all +// lanes active, to turn an inclusive scan into an exclusive scan. +Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V, + Value *const Identity) const { + Type *const Ty = V->getType(); + Module *M = B.GetInsertBlock()->getModule(); + Function *UpdateDPP = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty); + Function *ReadLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {}); + Function *WriteLane = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + + if (ST->hasDPPWavefrontShifts()) { + // GFX9 has DPP wavefront shift operations. + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), + B.getInt32(0xf), B.getFalse()}); + } else { + // On GFX10 all DPP operations are confined to a single row. To get cross- + // row operations we have to use permlane or readlane. + Value *Old = V; + V = B.CreateCall(UpdateDPP, + {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1), + B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); + + // Copy the old lane 15 to the new lane 16. + V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}), + B.getInt32(16), V}); + + if (!ST->isWave32()) { + // Copy the old lane 31 to the new lane 32. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V}); + + // Copy the old lane 47 to the new lane 48. + V = B.CreateCall( + WriteLane, + {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V}); + } + } + + return V; +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -345,23 +445,29 @@ // We need to know how many lanes are active within the wavefront, and we do // this by doing a ballot of active lanes. + Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize()); CallInst *const Ballot = B.CreateIntrinsic( - Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()}, + Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()}, {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)}); // We need to know how many lanes are active within the wavefront that are // below us. If we counted each lane linearly starting from 0, a lane is // below us only if its associated index was less than ours. We do this by // using the mbcnt intrinsic. - Value *const BitCast = B.CreateBitCast(Ballot, VecTy); - Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); - Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); - CallInst *const PartialMbcnt = B.CreateIntrinsic( - Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)}); - Value *const Mbcnt = - B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, - {ExtractHi, PartialMbcnt}), - Ty, false); + Value *Mbcnt; + if (ST->isWave32()) { + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {Ballot, B.getInt32(0)}); + } else { + Value *const BitCast = B.CreateBitCast(Ballot, VecTy); + Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0)); + Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1)); + Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, + {ExtractLo, B.getInt32(0)}); + Mbcnt = + B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt}); + } + Mbcnt = B.CreateIntCast(Mbcnt, Ty, false); Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth)); @@ -373,45 +479,25 @@ if (ValDivergent) { // First we need to set all inactive invocations to the identity value, so // that they can correctly contribute to the final result. - CallInst *const SetInactive = - B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - - ExclScan = - B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty, - {Identity, SetInactive, B.getInt32(DPP_WF_SR1), - B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); - - const unsigned Iters = 6; - const unsigned DPPCtrl[Iters] = {DPP_ROW_SR1, DPP_ROW_SR2, - DPP_ROW_SR4, DPP_ROW_SR8, - DPP_ROW_BCAST15, DPP_ROW_BCAST31}; - const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xa, 0xc}; - const unsigned BankMask[Iters] = {0xf, 0xf, 0xe, 0xc, 0xf, 0xf}; - - // This loop performs an exclusive scan across the wavefront, with all lanes - // active (by using the WWM intrinsic). - for (unsigned Idx = 0; Idx < Iters; Idx++) { - CallInst *const DPP = B.CreateIntrinsic( - Intrinsic::amdgcn_update_dpp, Ty, - {Identity, ExclScan, B.getInt32(DPPCtrl[Idx]), - B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()}); - - ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP); - } + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + NewV = buildScan(B, ScanOp, NewV, Identity); + ExclScan = buildShiftRight(B, NewV, Identity); // Read the value from the last lane, which has accumlated the values of // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty()); + B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)}); + Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); Value *const PartialInsert = B.CreateInsertElement( UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); Value *const Insert = @@ -419,7 +505,7 @@ NewV = B.CreateBitCast(Insert, Ty); } else if (TyBitWidth == 32) { NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, B.getInt32(63)}); + {NewV, LastLaneIdx}); } else { llvm_unreachable("Unhandled atomic bit width"); } @@ -506,7 +592,7 @@ if (TyBitWidth == 64) { Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty()); Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty()); + B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty()); CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo); CallInst *const ReadFirstLaneHi = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -949,6 +949,14 @@ return HasDPP; } + bool hasDPPBroadcasts() const { + return HasDPP && getGeneration() < GFX10; + } + + bool hasDPPWavefrontShifts() const { + return HasDPP && getGeneration() < GFX10; + } + bool hasDPP8() const { return HasDPP8; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -444,6 +444,7 @@ enum DppCtrl : unsigned { QUAD_PERM_FIRST = 0, + QUAD_PERM_ID = 0xE4, // identity permutation QUAD_PERM_LAST = 0xFF, DPP_UNUSED1 = 0x100, ROW_SHL0 = 0x100, diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) @@ -9,11 +11,13 @@ ; Show that what the atomic optimization pass will do for raw buffers. ; GCN-LABEL: add_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -24,11 +28,13 @@ } ; GCN-LABEL: add_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -44,14 +50,15 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} -; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -76,11 +83,13 @@ } ; GCN-LABEL: sub_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -91,11 +100,13 @@ } ; GCN-LABEL: sub_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] @@ -111,14 +122,15 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} -; GFX8MORE: v_mov_b32_dpp v[[wave_shr1:[0-9]+]], v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v[[wave_shr1]] row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xe -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xc -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:1 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:2 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:4 row_mask:0xf bank_mask:0xf +; GFX8MORE: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_shr:8 row_mask:0xf bank_mask:0xf +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX89: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} wave_shr:1 row_mask:0xf bank_mask:0xf ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,17 +1,21 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s declare i32 @llvm.amdgcn.workitem.id.x() ; Show that what the atomic optimization pass will do for global pointers. ; GCN-LABEL: add_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -22,11 +26,13 @@ } ; GCN-LABEL: add_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: {{flat|buffer|global}}_atomic_add v[[value]] @@ -42,9 +48,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -56,11 +63,13 @@ } ; GCN-LABEL: add_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -72,11 +81,13 @@ } ; GCN-LABEL: add_i64_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { entry: @@ -100,11 +111,13 @@ } ; GCN-LABEL: sub_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -115,11 +128,13 @@ } ; GCN-LABEL: sub_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] @@ -135,9 +150,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -149,11 +165,13 @@ } ; GCN-LABEL: sub_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -165,11 +183,13 @@ } ; GCN-LABEL: sub_i64_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,6 +1,9 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX1064 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX1032 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -10,11 +13,13 @@ ; Show that what the atomic optimization pass will do for local pointers. ; GCN-LABEL: add_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) { @@ -25,11 +30,13 @@ } ; GCN-LABEL: add_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] @@ -45,9 +52,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { @@ -58,12 +66,156 @@ ret void } +define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { +; GFX1032-LABEL: add_i32_varying_gfx1032: +; GFX1032: v_mov_b32_e32 v2, v0 +; GFX1032: s_or_saveexec_b32 s2, -1 +; GFX1032: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032: v_mov_b32_e32 v1, 0 +; GFX1032: s_mov_b32 exec_lo, s2 +; GFX1032: v_cmp_ne_u32_e64 s2, 1, 0 +; GFX1032: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1032: s_not_b32 exec_lo, exec_lo +; GFX1032: v_mov_b32_e32 v2, 0 +; GFX1032: s_not_b32 exec_lo, exec_lo +; GFX1032: s_or_saveexec_b32 s4, -1 +; GFX1032: v_mov_b32_e32 v3, v1 +; GFX1032: v_mov_b32_e32 v4, v1 +; GFX1032: s_mov_b32 s2, -1 +; GFX1032: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032: v_add_nc_u32_e32 v2, v2, v3 +; GFX1032: v_mov_b32_e32 v3, v1 +; GFX1032: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032: v_add_nc_u32_e32 v2, v2, v3 +; GFX1032: v_mov_b32_e32 v3, v1 +; GFX1032: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032: v_add_nc_u32_e32 v2, v2, v3 +; GFX1032: v_mov_b32_e32 v3, v1 +; GFX1032: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032: v_add_nc_u32_e32 v2, v2, v3 +; GFX1032: v_mov_b32_e32 v3, v2 +; GFX1032: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032: v_add_nc_u32_e32 v2, v2, v4 +; GFX1032: v_readlane_b32 s3, v2, 31 +; GFX1032: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032: v_readlane_b32 s5, v2, 15 +; GFX1032: v_writelane_b32 v1, s5, 16 +; GFX1032: s_mov_b32 exec_lo, s4 +; GFX1032: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032: s_and_saveexec_b32 s4, vcc_lo +; GFX1032: s_cbranch_execz BB3_2 +; GFX1032: BB3_1: +; GFX1032: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032: v_mov_b32_e32 v5, s3 +; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032: s_waitcnt_vscnt null, 0x0 +; GFX1032: ds_add_rtn_u32 v0, v0, v5 +; GFX1032: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032: buffer_gl0_inv +; GFX1032: buffer_gl1_inv +; GFX1032: BB3_2: +; GFX1032: v_nop +; GFX1032: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032: v_readfirstlane_b32 s3, v0 +; GFX1032: v_mov_b32_e32 v0, v1 +; GFX1032: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032: s_mov_b32 s3, 0x31016000 +; GFX1032: s_nop 1 +; GFX1032: s_waitcnt lgkmcnt(0) +; GFX1032: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { +; GFX1064-LABEL: add_i32_varying_gfx1064: +; GFX1064: v_mov_b32_e32 v2, v0 +; GFX1064: s_or_saveexec_b64 s[2:3], -1 +; GFX1064: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064: v_mov_b32_e32 v1, 0 +; GFX1064: s_mov_b64 exec, s[2:3] +; GFX1064: v_cmp_ne_u32_e64 s[2:3], 1, 0 +; GFX1064: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX1064: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 +; GFX1064: s_not_b64 exec, exec +; GFX1064: v_mov_b32_e32 v2, 0 +; GFX1064: s_not_b64 exec, exec +; GFX1064: s_or_saveexec_b64 s[4:5], -1 +; GFX1064: v_mov_b32_e32 v3, v1 +; GFX1064: v_mov_b32_e32 v4, v1 +; GFX1064: s_mov_b32 s2, -1 +; GFX1064: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v3 +; GFX1064: v_mov_b32_e32 v3, v1 +; GFX1064: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v3 +; GFX1064: v_mov_b32_e32 v3, v1 +; GFX1064: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v3 +; GFX1064: v_mov_b32_e32 v3, v1 +; GFX1064: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v3 +; GFX1064: v_mov_b32_e32 v3, v2 +; GFX1064: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v4 +; GFX1064: v_mov_b32_e32 v4, v1 +; GFX1064: v_readlane_b32 s3, v2, 31 +; GFX1064: v_mov_b32_e32 v3, s3 +; GFX1064: v_mov_b32_dpp v4, v3 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064: v_add_nc_u32_e32 v2, v2, v4 +; GFX1064: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064: v_readlane_b32 s3, v2, 15 +; GFX1064: v_readlane_b32 s6, v2, 31 +; GFX1064: v_writelane_b32 v1, s3, 16 +; GFX1064: v_readlane_b32 s3, v2, 63 +; GFX1064: v_writelane_b32 v1, s6, 32 +; GFX1064: v_readlane_b32 s6, v2, 47 +; GFX1064: v_writelane_b32 v1, s6, 48 +; GFX1064: s_mov_b64 exec, s[4:5] +; GFX1064: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064: s_and_saveexec_b64 s[4:5], vcc +; GFX1064: s_cbranch_execz BB4_2 +; GFX1064: BB4_1: +; GFX1064: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064: v_mov_b32_e32 v5, s3 +; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064: s_waitcnt_vscnt null, 0x0 +; GFX1064: ds_add_rtn_u32 v0, v0, v5 +; GFX1064: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064: buffer_gl0_inv +; GFX1064: buffer_gl1_inv +; GFX1064: BB4_2: +; GFX1064: v_nop +; GFX1064: s_or_b64 exec, exec, s[4:5] +; GFX1064: v_readfirstlane_b32 s3, v0 +; GFX1064: v_mov_b32_e32 v0, v1 +; GFX1064: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064: s_mov_b32 s3, 0x31016000 +; GFX1064: s_nop 1 +; GFX1064: s_waitcnt lgkmcnt(0) +; GFX1064: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel + store i32 %old, i32 addrspace(1)* %out + ret void +} + ; GCN-LABEL: add_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -75,11 +227,13 @@ } ; GCN-LABEL: add_i64_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: ds_add_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive) { entry: @@ -103,11 +257,13 @@ } ; GCN-LABEL: sub_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { @@ -118,11 +274,13 @@ } ; GCN-LABEL: sub_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] @@ -138,9 +296,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { @@ -152,11 +311,13 @@ } ; GCN-LABEL: sub_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -168,11 +329,13 @@ } ; GCN-LABEL: sub_i64_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s{{[0-9]+}}, s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s{{[0-9]+}}, s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: ds_sub_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive) { entry: @@ -196,7 +359,8 @@ } ; GCN-LABEL: and_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { @@ -208,7 +372,8 @@ } ; GCN-LABEL: or_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { @@ -220,7 +385,8 @@ } ; GCN-LABEL: xor_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { @@ -232,7 +398,8 @@ } ; GCN-LABEL: max_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { @@ -244,10 +411,11 @@ } ; GCN-LABEL: max_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 ; GCN: ds_max_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -259,7 +427,8 @@ } ; GCN-LABEL: min_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { @@ -271,10 +440,11 @@ } ; GCN-LABEL: min_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 ; GCN: ds_min_rtn_i64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -286,7 +456,8 @@ } ; GCN-LABEL: umax_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { @@ -298,10 +469,11 @@ } ; GCN-LABEL: umax_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 ; GCN: ds_max_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -313,7 +485,8 @@ } ; GCN-LABEL: umin_i32_varying: -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v[[value]] define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { @@ -325,10 +498,11 @@ } ; GCN-LABEL: umin_i64_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], 5 ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], 0 ; GCN: ds_min_rtn_u64 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v{{[0-9]+}}, v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -1,6 +1,8 @@ -; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s declare i1 @llvm.amdgcn.wqm.vote(i1) declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) @@ -10,11 +12,13 @@ ; GCN-LABEL: add_i32_constant: ; GCN-LABEL: BB0_1: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] ; GCN: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]] @@ -36,13 +40,15 @@ ; GCN-LABEL: add_i32_varying: ; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 -; GFX8MORE: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GFX8MORE: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] +; GFX8MORE32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GFX8MORE64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GFX8MORE: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GFX8MORE64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; GFX8MORE: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] ; GFX8MORE: v_readfirstlane_b32 s{{[0-9]+}}, v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) @@ -9,11 +11,13 @@ ; Show that what the atomic optimization pass will do for raw buffers. ; GCN-LABEL: add_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -24,11 +28,13 @@ } ; GCN-LABEL: add_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -44,9 +50,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -71,11 +78,13 @@ } ; GCN-LABEL: sub_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -86,11 +95,13 @@ } ; GCN-LABEL: sub_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] @@ -106,9 +117,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,6 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8MORE %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,DPPCOMB %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) @@ -9,11 +11,13 @@ ; Show that what the atomic optimization pass will do for struct buffers. ; GCN-LABEL: add_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -24,11 +28,13 @@ } ; GCN-LABEL: add_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -44,9 +50,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_add v{{[0-9]+}} -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_add_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -84,11 +91,13 @@ } ; GCN-LABEL: sub_i32_constant: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -99,11 +108,13 @@ } ; GCN-LABEL: sub_i32_uniform: -; GCN: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_lo:[0-9]+]], s[[exec_lo]], 0 -; GCN: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt_hi:[0-9]+]], s[[exec_hi]], v[[mbcnt_lo]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc, 0, v[[mbcnt_hi]] -; GCN: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} +; GCN32: v_cmp_ne_u32_e64 s[[exec_lo:[0-9]+]], 1, 0 +; GCN64: v_cmp_ne_u32_e64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, 1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 +; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] +; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] +; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] @@ -119,9 +130,10 @@ ; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 ; GFX7LESS-NOT: s_bcnt1_i32_b64 ; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_sub{{(rev)?}}_u32_dpp -; GFX8MORE: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 +; DPPCOMB: v_add_u32_dpp +; DPPCOMB: v_add_u32_dpp +; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 +; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 ; GFX8MORE: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {