Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -47,12 +47,17 @@
   DominatorTree *DT;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  bool IsComputeKernel;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+  std::pair<Value *, Value *>
+  buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+                       Value *const Identity, Value *const Ballot, Value *V,
+                       Instruction &I) const;
 
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
@@ -93,6 +98,7 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+  IsComputeKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
 
   visit(F);
 
@@ -430,6 +436,73 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value using an iterative approach. This provides an alternative to the DPP
+// implementation, which uses WWM for the scan computation. This API iterates
+// over the lanes to read, compute and (conditionally) update the value using
+// the readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+    Value *const Ballot, Value *V, Instruction &I) const {
+  Type *const Ty = I.getType();
+  const bool NeedResult = !I.use_empty();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *WriteLaneDecl =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+  Value *ReducedValue = Identity;
+  Value *PartialSumTillPreviousLane = Identity;
+  BasicBlock *CurrentBasicBlock = nullptr;
+  PHINode *DestWrite = nullptr;
+  Value *Scan = V;
+  Value *DestWriteOfPreviousLane = V;
+  Instruction *TerminatorInConditionalWritelaneBlock = nullptr;
+  unsigned WaveFrontSize = ST->isWave32() ? 32 : 64;
+  Type *const BallotTy = Ballot->getType();
+  const unsigned TyBitWidth = DL->getTypeSizeInBits(BallotTy);
+
+  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; LaneIdx++) {
+    // Iterate over all the lanes of a wavefront to compute the partial sum. If
+    // the lane is not active, select the Identity value in the computation;
+    // otherwise use the value extracted with readlane.
+    Value *Mask =
+        B.CreateShl(B.getIntN(TyBitWidth, 1), B.getIntN(TyBitWidth, LaneIdx));
+    Value *BallotAndMask = B.CreateAnd(Ballot, Mask);
+    Value *IsBitSet = B.CreateICmpEQ(BallotAndMask, Mask);
+    Value *Select =
+        B.CreateSelect(IsBitSet,
+                       B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                         {V, B.getInt32(LaneIdx)}),
+                       Identity);
+    ReducedValue = buildNonAtomicBinOp(B, Op, ReducedValue, Select);
+
+    // Perform the writelane only on the active lanes if the intermediate
+    // scan results are required.
+    if (NeedResult) {
+      CurrentBasicBlock = I.getParent();
+      // Split the current basic block into IfThen so that the writelane is
+      // performed conditionally, on the active lanes only.
+      TerminatorInConditionalWritelaneBlock =
+          SplitBlockAndInsertIfThen(IsBitSet, &I, false, nullptr, DT, nullptr);
+
+      // Write the exclusive scan result (the partial sum up to the previous
+      // lane) into the current lane.
+      B.SetInsertPoint(TerminatorInConditionalWritelaneBlock);
+      Scan = B.CreateCall(WriteLaneDecl, {PartialSumTillPreviousLane,
+                                          B.getInt32(LaneIdx), Scan});
+
+      B.SetInsertPoint(&I);
+      DestWrite = B.CreatePHI(Ty, 2, "DestWrite");
+      DestWrite->addIncoming(DestWriteOfPreviousLane, CurrentBasicBlock);
+      DestWrite->addIncoming(
+          Scan, TerminatorInConditionalWritelaneBlock->getParent());
+      // Values used for the next iteration.
+      DestWriteOfPreviousLane = DestWrite;
+      PartialSumTillPreviousLane = ReducedValue;
+    }
+  }
+  return std::make_pair(Scan, ReducedValue);
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -531,33 +604,39 @@
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
   if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
-    const AtomicRMWInst::BinOp ScanOp =
-        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    if (!NeedResult && ST->hasPermLaneX16()) {
-      // On GFX10 the permlanex16 instruction helps us build a reduction without
-      // too many readlanes and writelanes, which are generally bad for
-      // performance.
-      NewV = buildReduction(B, ScanOp, NewV, Identity);
+    const AtomicRMWInst::BinOp ScanOp =
+        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+    if (!IsComputeKernel) {
+      // First we need to set all inactive invocations to the identity value,
+      // so that they can correctly contribute to the final result.
+      NewV =
+          B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+      if (!NeedResult && ST->hasPermLaneX16()) {
+        // On GFX10 the permlanex16 instruction helps us build a reduction
+        // without too many readlanes and writelanes, which are generally bad
+        // for performance.
+        NewV = buildReduction(B, ScanOp, NewV, Identity);
+      } else {
+        NewV = buildScan(B, ScanOp, NewV, Identity);
+        if (NeedResult)
+          ExclScan = buildShiftRight(B, NewV, Identity);
+        // Read the value from the last lane, which has accumulated the values
+        // of each active lane in the wavefront. This will be our new value
+        // which we will provide to the atomic operation.
+        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        assert(TyBitWidth == 32);
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                 {NewV, LastLaneIdx});
+      }
+      // Finally mark the readlanes in the WWM section.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
     } else {
-      NewV = buildScan(B, ScanOp, NewV, Identity);
-      if (NeedResult)
-        ExclScan = buildShiftRight(B, NewV, Identity);
-
-      // Read the value from the last lane, which has accumulated the values of
-      // each active lane in the wavefront. This will be our new value which we
-      // will provide to the atomic operation.
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative, iterative implementation of the scan.
+      std::tie(ExclScan, NewV) =
+          buildScanIteratively(B, ScanOp, Identity, Ballot, V, I);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -594,11 +673,19 @@
     }
   }
 
-  // We only want a single lane to enter our new control flow, and we do this
-  // by checking if there are any active lanes below us. Only one lane will
-  // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
-
+  Value *Cond = nullptr;
+  if (ValDivergent && IsComputeKernel) {
+    // Only the first active lane will enter the new control flow to update the
+    // value.
+    CallInst *const FirstActiveLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt);
+    Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane);
+  } else {
+    // We only want a single lane to enter our new control flow, and we do this
+    // by checking if there are any active lanes below us. Only one lane will
+    // have 0 active lanes below us, so that will be the only one to progress.
+    Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+  }
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
 
@@ -660,8 +747,12 @@
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset =
-        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    if (!IsComputeKernel) {
+      LaneOffset =
+          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    } else {
+      LaneOffset = ExclScan;
+    }
   } else {
     switch (Op) {
     default:
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -270,11 +270,10 @@
                                         cl::init(true), cl::Hidden);
 
 // Enable atomic optimization
-static cl::opt<bool> EnableAtomicOptimizations(
-  "amdgpu-atomic-optimizations",
-  cl::desc("Enable atomic optimizations"),
-  cl::init(false),
-  cl::Hidden);
+static cl::opt<bool>
+    EnableAtomicOptimizations("amdgpu-atomic-optimizations",
+                              cl::desc("Enable atomic optimizations"),
+                              cl::init(true), cl::Hidden);
 
 // Enable Mode register optimization
 static cl::opt<bool> EnableSIModeRegisterPass(
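As a reference for what the unrolled per-lane loop in buildScanIteratively computes, here is a small host-side model. It is a sketch only: it assumes an integer add reduction (identity 0), the helper name iterativeScanModel and its parameters are made up for the example, and it is not the IR the pass emits. Each lane whose bit is set in the ballot receives the exclusive prefix sum of the earlier active lanes, and the final reduced value is the total over all active lanes.

  // Host-side model of the iterative exclusive scan (illustrative sketch only).
  #include <cstdint>
  #include <vector>

  // Models one wavefront: LaneValues holds each lane's input, Ballot marks the
  // active lanes. ExclScan receives the exclusive prefix sum per active lane
  // and Reduced the total over all active lanes (identity 0 for integer add).
  static void iterativeScanModel(const std::vector<uint32_t> &LaneValues,
                                 uint64_t Ballot, unsigned WaveFrontSize,
                                 std::vector<uint32_t> &ExclScan,
                                 uint32_t &Reduced) {
    const uint32_t Identity = 0;
    uint32_t PartialSumTillPreviousLane = Identity;
    Reduced = Identity;
    ExclScan.assign(WaveFrontSize, Identity);
    for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; ++LaneIdx) {
      bool IsBitSet = (Ballot >> LaneIdx) & 1;
      // Inactive lanes contribute the identity, mirroring the select over the
      // readlane result in the pass.
      uint32_t Select = IsBitSet ? LaneValues[LaneIdx] : Identity;
      Reduced += Select;
      if (IsBitSet) {
        // Mirrors the conditional writelane: the active lane is given the sum
        // of all earlier active lanes, then the running sum moves forward.
        ExclScan[LaneIdx] = PartialSumTillPreviousLane;
        PartialSumTillPreviousLane = Reduced;
      }
    }
  }

For example, with Ballot = 0b1011 and LaneValues = {5, 7, 0, 3, ...}, the active lanes 0, 1 and 3 end up with ExclScan values 0, 5 and 12, and Reduced is 15, which corresponds to the value the single elected lane hands to the atomic operation.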