Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -423,6 +423,10 @@
 };
 } // anonymous namespace
 
+///
+/// NVPTX API calls.
+///
+
 /// Get the GPU warp size.
 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
   return CGF.EmitRuntimeCall(
@@ -431,6 +435,17 @@
       "nvptx_warp_size");
 }
 
+/// Get the id of the current block on the GPU.
+///
+/// Emits a call to the NVVM intrinsic that reads the x component of the CTA
+/// id special register (ctaid.x).
+static llvm::Value *getNVPTXBlockID(CodeGenFunction &CGF) {
+  return CGF.EmitRuntimeCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x),
+      "nvptx_block_id");
+}
+
 /// Get the id of the current thread on the GPU.
 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
   return CGF.EmitRuntimeCall(
@@ -521,6 +536,49 @@
                        Bld.CreateNot(Mask), "master_tid");
 }
 
+/// Get number of OMP workers for parallel region after subtracting
+/// the master warp.
+///
+/// The master warp is sized by the runtime warp-size query instead of a
+/// literal 32. The subtraction is emitted with the no-unsigned-wrap flag.
+static llvm::Value *getNumWorkers(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // Emit the two reads as separate statements so the order of the generated
+  // calls does not depend on the host compiler's argument evaluation order.
+  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
+  llvm::Value *WarpSize = getNVPTXWarpSize(CGF);
+  return Bld.CreateNUWSub(NumThreads, WarpSize, "num_workers");
+}
+
+/// Get thread id in team.
+///
+/// Computed as the hardware thread id masked with (master thread id - 1).
+/// FIXME: Remove the expensive remainder operation.
+static llvm::Value *getTeamThreadId(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // N % M = N & (M-1) if M is a power of 2. The master Id is expected to be a
+  // power of two in all cases.
+  // NOTE(review): for a master Id that is not a power of two the mask no
+  // longer equals the remainder; confirm the expectation above always holds.
+  auto *Mask = Bld.CreateNUWSub(getMasterThreadID(CGF), Bld.getInt32(1));
+  return Bld.CreateAnd(getNVPTXThreadID(CGF), Mask, "team_tid");
+}
+
+/// Get global thread id.
+///
+/// Computed as (block id * number of workers) + thread id in team.
+static llvm::Value *getGlobalThreadId(CodeGenFunction &CGF) {
+  assert(CGF.CurFn && "No function in current CodeGenFunction.");
+  CGBuilderTy &Bld = CGF.Builder;
+  // Emit the operands in a fixed order; nesting the calls as arguments would
+  // leave the instruction order up to the host compiler.
+  llvm::Value *BlockID = getNVPTXBlockID(CGF);
+  llvm::Value *NumWorkers = getNumWorkers(CGF);
+  llvm::Value *TeamThreadID = getTeamThreadId(CGF);
+  return Bld.CreateAdd(Bld.CreateMul(BlockID, NumWorkers), TeamThreadID,
+                       "global_tid");
+}
+
 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
     CodeGenModule &CGM, SourceLocation Loc)
     : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
@@ -2876,9 +2934,17 @@
   // Get the array of arguments.
   SmallVector Args;
 
-  // TODO: suppport SIMD and pass actual values
-  Args.emplace_back(
-      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
+  // First argument is the global thread ID, stored to a 32-bit integer
+  // temporary whose address is passed along.
+  Address GlobalThreadIDAddr =
+      CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, "global_tid");
+  CGF.EmitStoreOfScalar(getGlobalThreadId(CGF), GlobalThreadIDAddr,
+                        /*Volatile=*/false,
+                        Ctx.getIntTypeForBitwidth(/*DestWidth=*/32,
+                                                  /*Signed=*/1));
+  Args.emplace_back(GlobalThreadIDAddr.getPointer());
+
+  // TODO: support SIMD and pass actual value
   Args.emplace_back(
       llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));