Index: lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -58,8 +58,6 @@
   void optimizeAtomic(Instruction &I, Instruction::BinaryOps Op,
                       unsigned ValIdx, bool ValDivergent) const;
 
-  void setConvergent(CallInst *const CI) const;
-
 public:
   static char ID;
 
@@ -253,7 +251,6 @@
   CallInst *const Ballot =
       B.CreateIntrinsic(Intrinsic::amdgcn_icmp, {B.getInt32Ty()},
                         {B.getInt32(1), B.getInt32(0), B.getInt32(33)});
-  setConvergent(Ballot);
 
   // We need to know how many lanes are active within the wavefront that are
   // below us. If we counted each lane linearly starting from 0, a lane is
@@ -281,13 +278,11 @@
     // correctly contribute to the final result.
     CallInst *const SetInactive =
         B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-    setConvergent(SetInactive);
 
     CallInst *const FirstDPP =
         B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
                           {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
                            B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
-    setConvergent(FirstDPP);
     NewV = FirstDPP;
 
     const unsigned Iters = 7;
@@ -305,7 +300,6 @@
           Intrinsic::amdgcn_update_dpp, Ty,
           {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
            B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
-      setConvergent(DPP);
 
       NewV = B.CreateBinOp(Op, NewV, DPP);
     }
@@ -322,10 +316,8 @@
           B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
       CallInst *const ReadLaneLo = B.CreateIntrinsic(
           Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
-      setConvergent(ReadLaneLo);
       CallInst *const ReadLaneHi = B.CreateIntrinsic(
           Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
-      setConvergent(ReadLaneHi);
       Value *const PartialInsert = B.CreateInsertElement(
           UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
       Value *const Insert =
@@ -334,7 +326,6 @@
     } else if (TyBitWidth == 32) {
       CallInst *const ReadLane = B.CreateIntrinsic(Intrinsic::amdgcn_readlane,
                                                    {}, {NewV, B.getInt32(63)});
-      setConvergent(ReadLane);
       NewV = ReadLane;
     } else {
       llvm_unreachable("Unhandled atomic bit width");
@@ -398,20 +389,16 @@
           B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
       CallInst *const ReadFirstLaneLo =
           B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-      setConvergent(ReadFirstLaneLo);
       CallInst *const ReadFirstLaneHi =
           B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-      setConvergent(ReadFirstLaneHi);
       Value *const PartialInsert = B.CreateInsertElement(
           UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
       Value *const Insert =
           B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
       BroadcastI = B.CreateBitCast(Insert, Ty);
     } else if (TyBitWidth == 32) {
-      CallInst *const ReadFirstLane =
-          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-      setConvergent(ReadFirstLane);
-      BroadcastI = ReadFirstLane;
+
+      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
     } else {
       llvm_unreachable("Unhandled atomic bit width");
     }
@@ -439,10 +426,6 @@
   I.eraseFromParent();
 }
 
-void AMDGPUAtomicOptimizer::setConvergent(CallInst *const CI) const {
-  CI->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
-}
-
 INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
                       "AMDGPU atomic optimizations", false, false)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
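
Note: the patch itself does not state why the explicit setConvergent() calls can go away. A minimal sketch of the likely reasoning, assuming the AMDGPU intrinsics used here (amdgcn_icmp, amdgcn_set_inactive, amdgcn_update_dpp, amdgcn_readlane, amdgcn_readfirstlane) are declared with IntrConvergent in their TableGen definitions: a call site built through IRBuilder::CreateIntrinsic references the declared intrinsic Function, and CallBase::isConvergent() falls back to the callee's attributes, so the per-call-site Attribute::Convergent was redundant.

// Sketch only, assuming the intrinsic declaration carries IntrConvergent.
// CallBase::isConvergent() consults call-site attributes first and then
// falls back to the attributes of the called Function, so a call built
// with IRBuilder::CreateIntrinsic is already convergent without any
// addAttribute() on the CallInst.
CallInst *const ReadFirstLane =
    B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
assert(ReadFirstLane->isConvergent() &&
       "convergent intrinsic call should not need a call-site attribute");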