Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -47,12 +47,15 @@
   DominatorTree *DT;
   const GCNSubtarget *ST;
   bool IsPixelShader;
+  bool IsComputeKernel;
 
   Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                         Value *const Identity) const;
   Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
                    Value *const Identity) const;
   Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+  std::pair<Value *, Value *> buildScanIteratively(
+      IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+      Value *const Ballot, Value *V, Instruction &I) const;
   void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
                       bool ValDivergent) const;
@@ -93,7 +96,8 @@
   const TargetMachine &TM = TPC.getTM<TargetMachine>();
   ST = &TM.getSubtarget<GCNSubtarget>(F);
   IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
-
+  IsComputeKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+
   visit(F);
 
   const bool Changed = !ToReplace.empty();
@@ -430,6 +434,54 @@
   return V;
 }
 
+// Use the builder to create an exclusive scan and compute the final reduced
+// value with an iterative approach. This is an alternative to the DPP
+// implementation, which uses WWM for the scan computation. This routine
+// iterates over the lanes of the wavefront to read, combine and conditionally
+// update the value using the readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively(
+    IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity,
+    Value *const Ballot, Value *V, Instruction &I) const {
+  Type *const Ty = I.getType();
+  const bool NeedResult = !I.use_empty();
+  Module *M = B.GetInsertBlock()->getModule();
+  Function *WriteLaneDecl =
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+  Value *ReducedValue = Identity;
+  Value *PartialSumTillPreviousLane = Identity;
+  BasicBlock *CurrentBasicBlock = nullptr;
+  PHINode *DestWrite = nullptr;
+  Value *Scan = V;
+  Value *DestWriteOfPreviousLane = V;
+  Instruction *TerminatorInConditionalWritelaneBlock = nullptr;
+  unsigned WaveFrontSize = ST->isWave32() ? 32 : 64;
+  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; LaneIdx++) {
+    // Iterate over all the lanes of the wavefront to compute the partial sum.
+    // If the lane is not active, use the Identity value in the computation;
+    // otherwise use the value extracted with readlane.
+    Value *Mask = B.CreateShl(B.getInt64(1), B.getInt64(LaneIdx));
+    Value *BallotAndMask = B.CreateAnd(Ballot, Mask);
+    Value *IsBitSet = B.CreateICmpEQ(BallotAndMask, Mask);
+    Value *Select = B.CreateSelect(
+        IsBitSet,
+        B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                          {V, B.getInt32(LaneIdx)}),
+        Identity);
+    ReducedValue = buildNonAtomicBinOp(B, Op, ReducedValue, Select);
+
+    // Perform the writelane conditionally, on active lanes only, if the
+    // intermediate scan results are required.
+    if (NeedResult) {
+      CurrentBasicBlock = I.getParent();
+      // Split the current basic block into an if-then, so that the writelane
+      // is performed conditionally, on active lanes only.
+      TerminatorInConditionalWritelaneBlock =
+          SplitBlockAndInsertIfThen(IsBitSet, &I, false, nullptr, DT, nullptr);
+
+      // Write the exclusive scan result (the partial sum up to the previous
+      // lane) into the current lane.
+      B.SetInsertPoint(TerminatorInConditionalWritelaneBlock);
+      Scan = B.CreateCall(WriteLaneDecl, {PartialSumTillPreviousLane,
+                                          B.getInt32(LaneIdx), Scan});
+
+      B.SetInsertPoint(&I);
+      DestWrite = B.CreatePHI(Ty, 2, "DestWrite");
+      DestWrite->addIncoming(DestWriteOfPreviousLane, CurrentBasicBlock);
+      DestWrite->addIncoming(Scan,
+                             TerminatorInConditionalWritelaneBlock->getParent());
+      // Values used in the next iteration.
+      DestWriteOfPreviousLane = DestWrite;
+      PartialSumTillPreviousLane = ReducedValue;
+    }
+  }
+  return std::make_pair(Scan, ReducedValue);
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -530,34 +582,38 @@
 
   // If we have a divergent value in each lane, we need to combine the value
   // using DPP.
-  if (ValDivergent) {
-    // First we need to set all inactive invocations to the identity value, so
-    // that they can correctly contribute to the final result.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
+  if (ValDivergent) {
     const AtomicRMWInst::BinOp ScanOp =
-        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
-    if (!NeedResult && ST->hasPermLaneX16()) {
-      // On GFX10 the permlanex16 instruction helps us build a reduction without
-      // too many readlanes and writelanes, which are generally bad for
-      // performance.
-      NewV = buildReduction(B, ScanOp, NewV, Identity);
+        Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+    if (!IsComputeKernel) {
+      // First we need to set all inactive invocations to the identity value, so
+      // that they can correctly contribute to the final result.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+      const AtomicRMWInst::BinOp ScanOp =
+          Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+      if (!NeedResult && ST->hasPermLaneX16()) {
+        // On GFX10 the permlanex16 instruction helps us build a reduction without
+        // too many readlanes and writelanes, which are generally bad for
+        // performance.
+        NewV = buildReduction(B, ScanOp, NewV, Identity);
+      } else {
+        NewV = buildScan(B, ScanOp, NewV, Identity);
+        if (NeedResult)
+          ExclScan = buildShiftRight(B, NewV, Identity);
+        // Read the value from the last lane, which has accumulated the values of
+        // each active lane in the wavefront. This will be our new value which we
+        // will provide to the atomic operation.
+        Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+        assert(TyBitWidth == 32);
+        NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                 {NewV, LastLaneIdx});
+      }
+      // Finally mark the readlanes in the WWM section.
+      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
     } else {
-      NewV = buildScan(B, ScanOp, NewV, Identity);
-      if (NeedResult)
-        ExclScan = buildShiftRight(B, NewV, Identity);
-
-      // Read the value from the last lane, which has accumulated the values of
-      // each active lane in the wavefront. This will be our new value which we
-      // will provide to the atomic operation.
-      Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
-      assert(TyBitWidth == 32);
-      NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
-                               {NewV, LastLaneIdx});
+      // Alternative implementation of the scan, iterating over the lanes.
+      std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, Ballot, V, I);
     }
-
-    // Finally mark the readlanes in the WWM section.
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
   } else {
     switch (Op) {
     default:
@@ -594,10 +650,19 @@
     }
   }
 
-  // We only want a single lane to enter our new control flow, and we do this
-  // by checking if there are any active lanes below us. Only one lane will
-  // have 0 active lanes below us, so that will be the only one to progress.
-  Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+
+  Value *Cond = nullptr;
+  if (ValDivergent && IsComputeKernel) {
+    // Only the first active lane will enter the new control flow to update the value.
+    CallInst *const FirstActiveLane =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt);
+    Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane);
+  } else {
+    // We only want a single lane to enter our new control flow, and we do this
+    // by checking if there are any active lanes below us. Only one lane will
+    // have 0 active lanes below us, so that will be the only one to progress.
+    Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+  }
 
   // Store I's original basic block before we split the block.
   BasicBlock *const EntryBB = I.getParent();
@@ -660,8 +725,12 @@
   // from the first lane, to get our lane's index into the atomic result.
   Value *LaneOffset = nullptr;
   if (ValDivergent) {
-    LaneOffset =
-        B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    if (!IsComputeKernel) {
+      LaneOffset =
+          B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+    } else {
+      LaneOffset = ExclScan;
+    }
   } else {
     switch (Op) {
     default:
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -273,7 +273,7 @@
 static cl::opt<bool> EnableAtomicOptimizations(
   "amdgpu-atomic-optimizations",
   cl::desc("Enable atomic optimizations"),
-  cl::init(false),
+  cl::init(true),
   cl::Hidden);
 
 // Enable Mode register optimization
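
As an illustration of the lane-by-lane scan introduced by buildScanIteratively above, the following is a minimal host-side sketch, not part of the patch: it assumes an integer add reduction on a 64-wide wave, and the per-lane values and the exec/ballot mask are made up for the example. Each iteration substitutes the identity for inactive lanes, records the running partial sum as the active lane's exclusive-scan result (the conditional writelane), and then folds the lane's value into the reduction.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  constexpr unsigned WaveFrontSize = 64; // ST->isWave32() ? 32 : 64 in the pass
  constexpr int Identity = 0;            // identity for an integer add

  // Hypothetical per-lane inputs and active-lane (exec/ballot) mask.
  std::vector<int> LaneValue(WaveFrontSize, 1);
  const uint64_t Exec = 0x00000000FFFF00FFull;

  std::vector<int> ExclScan(WaveFrontSize, Identity);
  int Reduced = Identity;

  for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; ++LaneIdx) {
    const bool IsActive = (Exec >> LaneIdx) & 1;
    // Models CreateSelect(IsBitSet, readlane(V, LaneIdx), Identity).
    const int V = IsActive ? LaneValue[LaneIdx] : Identity;
    // Models the conditional writelane of PartialSumTillPreviousLane.
    if (IsActive)
      ExclScan[LaneIdx] = Reduced;
    // Models buildNonAtomicBinOp updating ReducedValue.
    Reduced += V;
  }

  std::cout << "reduction = " << Reduced << "\n";
  std::cout << "exclusive scan at lane 16 = " << ExclScan[16] << "\n";
  return 0;
}

The single atomic update is then performed by only one lane (the first active lane in the compute-kernel path above), and each lane adds its exclusive-scan value to the value returned by the atomic to recover its own result.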