Index: lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- lib/CodeGen/CGOpenMPRuntime.h +++ lib/CodeGen/CGOpenMPRuntime.h @@ -893,6 +893,32 @@ OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen, bool HasCancel = false); + + /// Emits reduction function. + /// \param ArgsType Array type containing pointers to reduction variables. + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + llvm::Value *emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType, + ArrayRef Privates, + ArrayRef LHSExprs, + ArrayRef RHSExprs, + ArrayRef ReductionOps); + + /// Emits single reduction combiner + void emitSingleReductionCombiner(CodeGenFunction &CGF, + const Expr *ReductionOp, + const Expr *PrivateRef, + const DeclRefExpr *LHS, + const DeclRefExpr *RHS); + + struct ReductionOptionsTy { + bool WithNowait; + bool SimpleReduction; + OpenMPDirectiveKind ReductionKind; + }; /// \brief Emit a code for reduction clause. Next code should be emitted for /// reduction: /// \code @@ -929,14 +955,18 @@ /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' /// or 'operator binop(LHS, RHS)'. - /// \param WithNowait true if parent directive has also nowait clause, false - /// otherwise. + /// \param Options List of options for reduction codegen: + /// WithNowait true if parent directive has also nowait clause, false + /// otherwise. + /// SimpleReduction Emit reduction operation only. Used for omp simd + /// directive on the host. + /// ReductionKind The kind of reduction to perform. virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps, - bool WithNowait, bool SimpleReduction); + ReductionOptionsTy Options); /// \brief Emit code for 'taskwait' directive. virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc); Index: lib/CodeGen/CGOpenMPRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntime.cpp +++ lib/CodeGen/CGOpenMPRuntime.cpp @@ -4257,12 +4257,10 @@ CGF.EmitIgnoredExpr(ReductionOp); } -static llvm::Value *emitReductionFunction(CodeGenModule &CGM, - llvm::Type *ArgsType, - ArrayRef Privates, - ArrayRef LHSExprs, - ArrayRef RHSExprs, - ArrayRef ReductionOps) { +llvm::Value *CGOpenMPRuntime::emitReductionFunction( + CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps) { auto &C = CGM.getContext(); // void reduction_func(void *LHSArg, void *RHSArg); @@ -4345,11 +4343,11 @@ return Fn; } -static void emitSingleReductionCombiner(CodeGenFunction &CGF, - const Expr *ReductionOp, - const Expr *PrivateRef, - const DeclRefExpr *LHS, - const DeclRefExpr *RHS) { +void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, + const Expr *ReductionOp, + const Expr *PrivateRef, + const DeclRefExpr *LHS, + const DeclRefExpr *RHS) { if (PrivateRef->getType()->isArrayType()) { // Emit reduction for array section. auto *LHSVar = cast(LHS->getDecl()); @@ -4369,9 +4367,13 @@ ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps, - bool WithNowait, bool SimpleReduction) { + ReductionOptionsTy Options) { if (!CGF.HaveInsertPoint()) return; + + bool WithNowait = Options.WithNowait; + bool SimpleReduction = Options.SimpleReduction; + // Next code should be emitted for reduction: // // static kmp_critical_name lock = { 0 }; @@ -4513,12 +4515,13 @@ }; auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps]( CodeGenFunction &CGF, PrePostActionTy &Action) { + auto &RT = CGF.CGM.getOpenMPRuntime(); auto IPriv = Privates.begin(); auto ILHS = LHSExprs.begin(); auto IRHS = RHSExprs.begin(); for (auto *E : ReductionOps) { - emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), - cast(*IRHS)); + RT.emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), + cast(*IRHS)); ++IPriv; ++ILHS; ++IRHS; Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -67,12 +67,6 @@ /// \brief Signal termination of Spmd mode execution. void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); - /// \brief Returns specified OpenMP runtime function for the current OpenMP - /// implementation. Specialized for the NVPTX device. - /// \param Function OpenMP runtime function. - /// \return Specified function. - llvm::Constant *createNVPTXRuntimeFunction(unsigned Function); - // // Base class overrides. // @@ -248,7 +242,32 @@ ArrayRef CapturedVars, const Expr *IfCond) override; -public: + /// Emit a code for reduction clause. + /// + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + /// \param Options List of options for reduction codegen: + /// WithNowait true if parent directive has also nowait clause, false + /// otherwise. + /// SimpleReduction Emit reduction operation only. Used for omp simd + /// directive on the host. + /// ReductionKind The kind of reduction to perform. + virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, + ArrayRef Privates, + ArrayRef LHSExprs, + ArrayRef RHSExprs, + ArrayRef ReductionOps, + ReductionOptionsTy Options) override; + + /// Returns specified OpenMP runtime function for the current OpenMP + /// implementation. Specialized for the NVPTX device. + /// \param Function OpenMP runtime function. + /// \return Specified function. + llvm::Constant *createNVPTXRuntimeFunction(unsigned Function); + /// Target codegen is specialized based on two programming models: the /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd' /// model for constructs like 'target parallel' that support it. Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -44,6 +44,20 @@ /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_end_serialized_parallel, + /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element, + /// int16_t lane_offset, int16_t warp_size); + OMPRTL_NVPTX__kmpc_shuffle_int32, + /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element, + /// int16_t lane_offset, int16_t warp_size); + OMPRTL_NVPTX__kmpc_shuffle_int64, + /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 + /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data, + /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t + /// lane_offset, int16_t shortCircuit), + /// void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); + OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, + /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); + OMPRTL_NVPTX__kmpc_end_reduce_nowait }; /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. @@ -100,6 +114,25 @@ } ~ExecutionModeRAII() { Mode = SavedMode; } }; + +/// GPU Configuration: This information can be derived from cuda registers, +/// however, providing compile time constants helps generate more efficient +/// code. For all practical purposes this is fine because the configuration +/// is the same for all known NVPTX architectures. +enum MachineConfiguration : unsigned { + WarpSize = 32, + /// Number of bits required to represent a lane identifier, which is + /// computed as log_2(WarpSize). + LaneIDBits = 5, + LaneIDMask = WarpSize - 1, +}; + +enum NamedBarrier : unsigned { + /// Synchronize on this barrier #ID using a named barrier primitive. + /// Only the subset of active threads in a parallel region arrive at the + /// barrier. + NB_Parallel = 1, +}; } // anonymous namespace /// Get the GPU warp size. @@ -120,6 +153,23 @@ llvm::None, "nvptx_tid"); } +/// Get the id of the warp in the block. +/// We assume that the warp size is 32, which is always the case +/// on the NVPTX device, to generate more efficient code. +static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id"); +} + +/// Get the id of the current lane in the Warp. +/// We assume that the warp size is 32, which is always the case +/// on the NVPTX device, to generate more efficient code. +static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask), + "nvptx_lane_id"); +} + /// Get the maximum number of threads in a block of the GPU. static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; @@ -136,9 +186,25 @@ &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); } +/// Get barrier #ID to synchronize selected (multiple of warp size) threads in +/// a CTA. +static void getNVPTXBarrier(CodeGenFunction &CGF, int ID, + llvm::Value *NumThreads) { + CGBuilderTy &Bld = CGF.Builder; + llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads}; + Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(), + llvm::Intrinsic::nvvm_barrier), + Args); +} + /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } +/// Synchronize worker threads in a parallel region. +static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) { + return getNVPTXBarrier(CGF, NB_Parallel, NumThreads); +} + /// Get the value of the thread_limit clause in the teams directive. /// For the 'generic' execution mode, the runtime encodes thread_limit in /// the launch parameters, always starting thread_limit+warpSize threads per @@ -583,6 +649,60 @@ RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); break; } + case OMPRTL_NVPTX__kmpc_shuffle_int32: { + // Build int32_t __kmpc_shuffle_int32(int32_t element, + // int16_t lane_offset, int16_t warp_size); + llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32"); + break; + } + case OMPRTL_NVPTX__kmpc_shuffle_int64: { + // Build int64_t __kmpc_shuffle_int64(int64_t element, + // int16_t lane_offset, int16_t warp_size); + llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64"); + break; + } + case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: { + // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid, + // kmp_int32 num_vars, size_t reduce_size, void* reduce_data, + // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t + // lane_offset, int16_t Algorithm Version), + // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num)); + llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, + CGM.Int16Ty, CGM.Int16Ty}; + auto *ShuffleReduceFnTy = + llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, + /*isVarArg=*/false); + llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; + auto *InterWarpCopyFnTy = + llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, + /*isVarArg=*/false); + llvm::Type *TypeParams[] = {CGM.Int32Ty, + CGM.Int32Ty, + CGM.SizeTy, + CGM.VoidPtrTy, + ShuffleReduceFnTy->getPointerTo(), + InterWarpCopyFnTy->getPointerTo()}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); + break; + } + case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { + // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); + llvm::Type *TypeParams[] = {CGM.Int32Ty}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait"); + break; + } } return RTLFn; } @@ -805,3 +925,890 @@ OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); } + +/// This function creates calls to one of two shuffle functions to copy +/// variables between lanes in a warp. +static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF, + QualType ElemTy, + llvm::Value *Elem, + llvm::Value *Offset) { + auto &CGM = CGF.CGM; + auto &C = CGM.getContext(); + auto &Bld = CGF.Builder; + CGOpenMPRuntimeNVPTX &RT = + *(static_cast(&CGM.getOpenMPRuntime())); + + unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity(); + assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction."); + + OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4 + ? OMPRTL_NVPTX__kmpc_shuffle_int32 + : OMPRTL_NVPTX__kmpc_shuffle_int64; + + // Cast all types to 32- or 64-bit values before calling shuffle routines. + auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty; + auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy); + auto *WarpSize = CGF.EmitScalarConversion( + getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true), + C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation()); + + auto *ShuffledVal = + CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn), + {ElemCast, Offset, WarpSize}); + + return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy)); +} + +namespace { +enum CopyAction : unsigned { + // RemoteLaneToThread: Copy over a Reduce list from a remote lane in + // the warp using shuffle instructions. + RemoteLaneToThread, + // ThreadCopy: Make a copy of a Reduce list on the thread's stack. + ThreadCopy, +}; +} // namespace + +/// Emit instructions to copy a Reduce list, which contains partially +/// aggregated values, in the specified direction. +static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, + QualType ReductionArrayTy, + ArrayRef Privates, + Address SrcBase, Address DestBase, + llvm::Value *RemoteLaneOffset = nullptr) { + + auto &CGM = CGF.CGM; + auto &C = CGM.getContext(); + auto &Bld = CGF.Builder; + + // Iterates, element-by-element, through the source Reduce list and + // make a copy. + unsigned Idx = 0; + for (auto &Private : Privates) { + Address SrcElementAddr = Address::invalid(); + Address DestElementAddr = Address::invalid(); + Address DestElementPtrAddr = Address::invalid(); + // Should we shuffle in an element from a remote lane? + bool ShuffleInElement = false; + // Set to true to update the pointer in the dest Reduce list to a + // newly created element. + bool UpdateDestListPtr = false; + + switch (Action) { + case RemoteLaneToThread: { + // Step 1.1: Get the address for the src element in the Reduce list. + Address SrcElementPtrAddr = + Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize()); + llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar( + SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + SrcElementAddr = + Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType())); + + // Step 1.2: Create a temporary to store the element in the destination + // Reduce list. + DestElementPtrAddr = + Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize()); + DestElementAddr = + CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); + ShuffleInElement = true; + UpdateDestListPtr = true; + break; + } + case ThreadCopy: { + // Step 1.1: Get the address for the src element in the Reduce list. + Address SrcElementPtrAddr = + Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize()); + llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar( + SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + SrcElementAddr = + Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType())); + + // Step 1.2: Get the address for dest element. The destination + // element has already been created on the thread's stack. + DestElementPtrAddr = + Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize()); + llvm::Value *DestElementPtr = + CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()); + Address DestElemAddr = + Address(DestElementPtr, C.getTypeAlignInChars(Private->getType())); + DestElementAddr = Bld.CreateElementBitCast( + DestElemAddr, CGF.ConvertTypeForMem(Private->getType())); + break; + } + } + + // Regardless of src and dest of copy, we emit the load of src + // element as this is required in all directions + SrcElementAddr = Bld.CreateElementBitCast( + SrcElementAddr, CGF.ConvertTypeForMem(Private->getType())); + llvm::Value *Elem = + CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false, + Private->getType(), SourceLocation()); + + // Now that all active lanes have read the element in the + // Reduce list, shuffle over the value from the remote lane. + if (ShuffleInElement) { + Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem, + RemoteLaneOffset); + } + + // Store the source element value to the dest element address. + CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false, + Private->getType()); + + // Step 3.1: Modify reference in dest Reduce list as needed. + // Modifying the reference in Reduce list to point to the newly + // created element. The element is live in the current function + // scope and that of functions it invokes (i.e., reduce_function). + // RemoteReduceData[i] = (void*)&RemoteElem + if (UpdateDestListPtr) { + CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast( + DestElementAddr.getPointer(), CGF.VoidPtrTy), + DestElementPtrAddr, /*Volatile=*/false, + C.VoidPtrTy); + } + + Idx++; + } +} + +// This function emits a helper that gathers Reduce lists from the first +// lane of every active warp to lanes in the first warp. +// +// void inter_warp_copy_func(void* reduce_data, num_warps) +// shared smem[warp_size]; +// For all data entries D in reduce_data: +// If (I am the first lane in each warp) +// Copy my local D to smem[warp_id] +// sync +// if (I am the first warp) +// Copy smem[thread_id] to my local D +// sync +static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy) { + auto &C = CGM.getContext(); + auto &M = CGM.getModule(); + + // ReduceList: thread local Reduce list. + // At the stage of the computation when this function is called, partially + // aggregated values reside in the first lane of every active warp. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // NumWarps: number of warps active in the parallel region. This could + // be smaller than 32 (max warps in a CTA) for partial block reduction. + ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.IntTy); + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&NumWarpsArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_inter_warp_copy_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + // This array is used as a medium to transfer, one reduce element at a time, + // the data from the first lane of every warp to lanes in the first warp + // in order to perform the final step of a reduction in a parallel region + // (reduction across warps). The array is placed in NVPTX __shared__ memory + // for reduced latency, as well as to have a distinct copy for concurrently + // executing target regions. The array is declared with common linkage so + // as to be shared across compilation units. + const char *TransferMediumName = + "__openmp_nvptx_data_transfer_temporary_storage"; + llvm::GlobalVariable *TransferMedium = + M.getGlobalVariable(TransferMediumName); + if (!TransferMedium) { + auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize); + unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); + TransferMedium = new llvm::GlobalVariable( + M, Ty, + /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, + llvm::Constant::getNullValue(Ty), TransferMediumName, + /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, + SharedAddressSpace); + } + + // Get the CUDA thread id of the current OpenMP thread on the GPU. + auto *ThreadID = getNVPTXThreadID(CGF); + // nvptx_lane_id = nvptx_id % warpsize + auto *LaneID = getNVPTXLaneID(CGF); + // nvptx_warp_id = nvptx_id / warpsize + auto *WarpID = getNVPTXWarpID(CGF); + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + unsigned Idx = 0; + for (auto &Private : Privates) { + // + // Warp master copies reduce element to transfer medium in __shared__ + // memory. + // + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); + + // if (lane_id == 0) + auto IsWarpMaster = + Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master"); + Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); + CGF.EmitBlock(ThenBB); + + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = (type[i]*)(elemptrptr) + Address ElemPtr = + Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + ElemPtr = Bld.CreateElementBitCast( + ElemPtr, CGF.ConvertTypeForMem(Private->getType())); + // elem = *elemptr + llvm::Value *Elem = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); + + // Get pointer to location in transfer medium. + // MediumPtr = &medium[warp_id] + llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); + Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType())); + // Casting to actual data type. + // MediumPtr = (type[i]*)MediumPtrAddr; + MediumPtr = Bld.CreateElementBitCast( + MediumPtr, CGF.ConvertTypeForMem(Private->getType())); + + //*MediumPtr = elem + Bld.CreateStore(Elem, MediumPtr); + + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); + llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( + AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation()); + + auto *NumActiveThreads = Bld.CreateNSWMul( + NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); + // named_barrier_sync(ParallelBarrierID, num_active_threads) + syncParallelThreads(CGF, NumActiveThreads); + + // + // Warp 0 copies reduce element from transfer medium. + // + llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); + + // Up to 32 threads in warp 0 are active. + auto IsActiveThread = + Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); + Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); + + CGF.EmitBlock(W0ThenBB); + + // SrcMediumPtr = &medium[tid] + llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); + Address SrcMediumPtr(SrcMediumPtrVal, + C.getTypeAlignInChars(Private->getType())); + // SrcMediumVal = *SrcMediumPtr; + SrcMediumPtr = Bld.CreateElementBitCast( + SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType())); + llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar( + SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); + + // TargetElemPtr = (type[i]*)(SrcDataAddr[i]) + Address TargetElemPtrPtr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( + TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + Address TargetElemPtr = + Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType())); + TargetElemPtr = Bld.CreateElementBitCast( + TargetElemPtr, CGF.ConvertTypeForMem(Private->getType())); + + // *TargetElemPtr = SrcMediumVal; + CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, + Private->getType()); + Bld.CreateBr(W0MergeBB); + + CGF.EmitBlock(W0ElseBB); + Bld.CreateBr(W0MergeBB); + + CGF.EmitBlock(W0MergeBB); + + // While warp 0 copies values from transfer medium, all other warps must + // wait. + syncParallelThreads(CGF, NumActiveThreads); + Idx++; + } + + CGF.FinishFunction(); + return Fn; +} + +/// Emit a helper that reduces data across two OpenMP threads (lanes) +/// in the same warp. It uses shuffle instructions to copy over data from +/// a remote lane's stack. The reduction algorithm performed is specified +/// by the fourth parameter. +/// +/// Algorithm Versions. +/// Full Warp Reduce (argument value 0): +/// This algorithm assumes that all 32 lanes are active and gathers +/// data from these 32 lanes, producing a single resultant value. +/// Contiguous Partial Warp Reduce (argument value 1): +/// This algorithm assumes that only a *contiguous* subset of lanes +/// are active. This happens for the last warp in a parallel region +/// when the user specified num_threads is not an integer multiple of +/// 32. This contiguous subset always starts with the zeroth lane. +/// Partial Warp Reduce (argument value 2): +/// This algorithm gathers data from any number of lanes at any position. +/// All reduced values are stored in the lowest possible lane. The set +/// of problems every algorithm addresses is a super set of those +/// addressable by algorithms with a lower version number. Overhead +/// increases as algorithm version increases. +/// +/// Terminology +/// Reduce element: +/// Reduce element refers to the individual data field with primitive +/// data types to be combined and reduced across threads. +/// Reduce list: +/// Reduce list refers to a collection of local, thread-private +/// reduce elements. +/// Remote Reduce list: +/// Remote Reduce list refers to a collection of remote (relative to +/// the current thread) reduce elements. +/// +/// We distinguish between three states of threads that are important to +/// the implementation of this function. +/// Alive threads: +/// Threads in a warp executing the SIMT instruction, as distinguished from +/// threads that are inactive due to divergent control flow. +/// Active threads: +/// The minimal set of threads that has to be alive upon entry to this +/// function. The computation is correct iff active threads are alive. +/// Some threads are alive but they are not active because they do not +/// contribute to the computation in any useful manner. Turning them off +/// may introduce control flow overheads without any tangible benefits. +/// Effective threads: +/// In order to comply with the argument requirements of the shuffle +/// function, we must keep all lanes holding data alive. But at most +/// half of them perform value aggregation; we refer to this half of +/// threads as effective. The other half is simply handing off their +/// data. +/// +/// Procedure +/// Value shuffle: +/// In this step active threads transfer data from higher lane positions +/// in the warp to lower lane positions, creating Remote Reduce list. +/// Value aggregation: +/// In this step, effective threads combine their thread local Reduce list +/// with Remote Reduce list and store the result in the thread local +/// Reduce list. +/// Value copy: +/// In this step, we deal with the assumption made by algorithm 2 +/// (i.e. contiguity assumption). When we have an odd number of lanes +/// active, say 2k+1, only k threads will be effective and therefore k +/// new values will be produced. However, the Reduce list owned by the +/// (2k+1)th thread is ignored in the value aggregation. Therefore +/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so +/// that the contiguity assumption still holds. +static llvm::Value * +emitShuffleAndReduceFunction(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy, llvm::Value *ReduceFn) { + auto &C = CGM.getContext(); + + // Thread local Reduce list used to host the values of data to be reduced. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // Current lane id; could be logical. + ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + // Offset of the remote source lane relative to the current lane. + ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + // Algorithm version. This is expected to be known at compile time. + ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&LaneIDArg); + Args.push_back(&RemoteLaneOffsetArg); + Args.push_back(&AlgoVerArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); + llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( + AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); + llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( + AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); + llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( + AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + // Create a local thread-private variable to host the Reduce list + // from a remote lane. + Address RemoteReduceList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); + + // This loop iterates through the list of reduce elements and copies, + // element by element, from a remote lane in the warp to RemoteReduceList, + // hosted on the thread's stack. + emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, + LocalReduceList, RemoteReduceList, + RemoteLaneOffsetArgVal); + + // The actions to be performed on the Remote Reduce list is dependent + // on the algorithm version. + // + // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && + // LaneId % 2 == 0 && Offset > 0): + // do the reduction value aggregation + // + // The thread local variable Reduce list is mutated in place to host the + // reduced data, which is the aggregated value produced from local and + // remote lanes. + // + // Note that AlgoVer is expected to be a constant integer known at compile + // time. + // When AlgoVer==0, the first conjunction evaluates to true, making + // the entire predicate true during compile time. + // When AlgoVer==1, the second conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. + // When AlgoVer==2, the third conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. + auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0)); + + auto CondAlgo1 = + Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), + Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); + + auto CondAlgo2 = Bld.CreateAnd( + Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)), + Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)), + Bld.getInt16(0))); + CondAlgo2 = Bld.CreateAnd( + CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); + + auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); + CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); + + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); + Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); + + CGF.EmitBlock(ThenBB); + // reduce_function(LocalReduceList, RemoteReduceList) + llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + LocalReduceList.getPointer(), CGF.VoidPtrTy); + llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + RemoteReduceList.getPointer(), CGF.VoidPtrTy); + CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr}); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local + // Reduce list. + auto CondCopy = + Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), + Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); + + llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); + Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); + + CGF.EmitBlock(CpyThenBB); + emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, + RemoteReduceList, LocalReduceList); + Bld.CreateBr(CpyMergeBB); + + CGF.EmitBlock(CpyElseBB); + Bld.CreateBr(CpyMergeBB); + + CGF.EmitBlock(CpyMergeBB); + + CGF.FinishFunction(); + return Fn; +} + +/// +/// Design of OpenMP reductions on the GPU +/// +/// Consider a typical OpenMP program with one or more reduction +/// clauses: +/// +/// float foo; +/// double bar; +/// #pragma omp target teams distribute parallel for \ +/// reduction(+:foo) reduction(*:bar) +/// for (int i = 0; i < N; i++) { +/// foo += A[i]; bar *= B[i]; +/// } +/// +/// where 'foo' and 'bar' are reduced across all OpenMP threads in +/// all teams. In our OpenMP implementation on the NVPTX device an +/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads +/// within a team are mapped to CUDA threads within a threadblock. +/// Our goal is to efficiently aggregate values across all OpenMP +/// threads such that: +/// +/// - the compiler and runtime are logically concise, and +/// - the reduction is performed efficiently in a hierarchical +/// manner as follows: within OpenMP threads in the same warp, +/// across warps in a threadblock, and finally across teams on +/// the NVPTX device. +/// +/// Introduction to Decoupling +/// +/// We would like to decouple the compiler and the runtime so that the +/// latter is ignorant of the reduction variables (number, data types) +/// and the reduction operators. This allows a simpler interface +/// and implementation while still attaining good performance. +/// +/// Pseudocode for the aforementioned OpenMP program generated by the +/// compiler is as follows: +/// +/// 1. Create private copies of reduction variables on each OpenMP +/// thread: 'foo_private', 'bar_private' +/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned +/// to it and writes the result in 'foo_private' and 'bar_private' +/// respectively. +/// 3. Call the OpenMP runtime on the GPU to reduce within a team +/// and store the result on the team master: +/// +/// __kmpc_nvptx_parallel_reduce_nowait(..., +/// reduceData, shuffleReduceFn, interWarpCpyFn) +/// +/// where: +/// struct ReduceData { +/// double *foo; +/// double *bar; +/// } reduceData +/// reduceData.foo = &foo_private +/// reduceData.bar = &bar_private +/// +/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two +/// auxiliary functions generated by the compiler that operate on +/// variables of type 'ReduceData'. They aid the runtime perform +/// algorithmic steps in a data agnostic manner. +/// +/// 'shuffleReduceFn' is a pointer to a function that reduces data +/// of type 'ReduceData' across two OpenMP threads (lanes) in the +/// same warp. It takes the following arguments as input: +/// +/// a. variable of type 'ReduceData' on the calling lane, +/// b. its lane_id, +/// c. an offset relative to the current lane_id to generate a +/// remote_lane_id. The remote lane contains the second +/// variable of type 'ReduceData' that is to be reduced. +/// d. an algorithm version parameter determining which reduction +/// algorithm to use. +/// +/// 'shuffleReduceFn' retrieves data from the remote lane using +/// efficient GPU shuffle intrinsics and reduces, using the +/// algorithm specified by the 4th parameter, the two operands +/// element-wise. The result is written to the first operand. +/// +/// Different reduction algorithms are implemented in different +/// runtime functions, all calling 'shuffleReduceFn' to perform +/// the essential reduction step. Therefore, based on the 4th +/// parameter, this function behaves slightly differently to +/// cooperate with the runtime to ensure correctness under +/// different circumstances. +/// +/// 'InterWarpCpyFn' is a pointer to a function that transfers +/// reduced variables across warps. It tunnels, through CUDA +/// shared memory, the thread-private data of type 'ReduceData' +/// from lane 0 of each warp to a lane in the first warp. +/// 5. if ret == 1: +/// The team master of the last team stores the reduced +/// result to the globals in memory. +/// foo += reduceData.foo; bar *= reduceData.bar +/// +/// +/// Warp Reduction Algorithms +/// +/// On the warp level, we have three algorithms implemented in the +/// OpenMP runtime depending on the number of active lanes: +/// +/// Full Warp Reduction +/// +/// The reduce algorithm within a warp where all lanes are active +/// is implemented in the runtime as follows: +/// +/// full_warp_reduce(void *reduce_data, +/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { +/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) +/// ShuffleReduceFn(reduce_data, 0, offset, 0); +/// } +/// +/// The algorithm completes in log(2, WARPSIZE) steps. +/// +/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is +/// not used therefore we save instructions by not retrieving lane_id +/// from the corresponding special registers. The 4th parameter, which +/// represents the version of the algorithm being used, is set to 0 to +/// signify full warp reduction. +/// +/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +/// +/// #reduce_elem refers to an element in the local lane's data structure +/// #remote_elem is retrieved from a remote lane +/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +/// reduce_elem = reduce_elem REDUCE_OP remote_elem; +/// +/// Contiguous Partial Warp Reduction +/// +/// This reduce algorithm is used within a warp where only the first +/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the +/// number of OpenMP threads in a parallel region is not a multiple of +/// WARPSIZE. The algorithm is implemented in the runtime as follows: +/// +/// void +/// contiguous_partial_reduce(void *reduce_data, +/// kmp_ShuffleReductFctPtr ShuffleReduceFn, +/// int size, int lane_id) { +/// int curr_size; +/// int offset; +/// curr_size = size; +/// mask = curr_size/2; +/// while (offset>0) { +/// ShuffleReduceFn(reduce_data, lane_id, offset, 1); +/// curr_size = (curr_size+1)/2; +/// offset = curr_size/2; +/// } +/// } +/// +/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +/// +/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +/// if (lane_id < offset) +/// reduce_elem = reduce_elem REDUCE_OP remote_elem +/// else +/// reduce_elem = remote_elem +/// +/// This algorithm assumes that the data to be reduced are located in a +/// contiguous subset of lanes starting from the first. When there is +/// an odd number of active lanes, the data in the last lane is not +/// aggregated with any other lane's dat but is instead copied over. +/// +/// Dispersed Partial Warp Reduction +/// +/// This algorithm is used within a warp when any discontiguous subset of +/// lanes are active. It is used to implement the reduction operation +/// across lanes in an OpenMP simd region or in a nested parallel region. +/// +/// void +/// dispersed_partial_reduce(void *reduce_data, +/// kmp_ShuffleReductFctPtr ShuffleReduceFn) { +/// int size, remote_id; +/// int logical_lane_id = number_of_active_lanes_before_me() * 2; +/// do { +/// remote_id = next_active_lane_id_right_after_me(); +/// # the above function returns 0 of no active lane +/// # is present right after the current lane. +/// size = number_of_active_lanes_in_this_warp(); +/// logical_lane_id /= 2; +/// ShuffleReduceFn(reduce_data, logical_lane_id, +/// remote_id-1-threadIdx.x, 2); +/// } while (logical_lane_id % 2 == 0 && size > 1); +/// } +/// +/// There is no assumption made about the initial state of the reduction. +/// Any number of lanes (>=1) could be active at any position. The reduction +/// result is returned in the first active lane. +/// +/// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +/// +/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +/// if (lane_id % 2 == 0 && offset > 0) +/// reduce_elem = reduce_elem REDUCE_OP remote_elem +/// else +/// reduce_elem = remote_elem +/// +/// +/// Intra-Team Reduction +/// +/// This function, as implemented in the runtime call +/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP +/// threads in a team. It first reduces within a warp using the +/// aforementioned algorithms. We then proceed to gather all such +/// reduced values at the first warp. +/// +/// The runtime makes use of the function 'InterWarpCpyFn', which copies +/// data from each of the "warp master" (zeroth lane of each warp, where +/// warp-reduced data is held) to the zeroth warp. This step reduces (in +/// a mathematical sense) the problem of reduction across warp masters in +/// a block to the problem of warp reduction. +/// +void CGOpenMPRuntimeNVPTX::emitReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps, ReductionOptionsTy Options) { + if (!CGF.HaveInsertPoint()) + return; + + bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); + assert(ParallelReduction && "Invalid reduction selection in emitReduction."); + + auto &C = CGM.getContext(); + + // 1. Build a list of reduction variables. + // void *RedList[] = {[0], ..., [-1]}; + auto Size = RHSExprs.size(); + for (auto *E : Privates) { + if (E->getType()->isVariablyModifiedType()) + // Reserve place for array size. + ++Size; + } + llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); + QualType ReductionArrayTy = + C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + unsigned Idx = 0; + for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + CGF.Builder.CreateStore( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + Elem); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. + ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .first, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); + } + } + + // 2. Emit reduce_func(). + auto *ReductionFn = emitReductionFunction( + CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, + LHSExprs, RHSExprs, ReductionOps); + + // 4. Build res = __kmpc_reduce{_nowait}(, , sizeof(RedList), + // RedList, shuffle_reduce_func, interwarp_copy_func); + auto *ThreadId = getThreadID(CGF, Loc); + auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); + auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + ReductionList.getPointer(), CGF.VoidPtrTy); + + auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction( + CGM, Privates, ReductionArrayTy, ReductionFn); + auto *InterWarpCopyFn = + emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy); + + llvm::Value *Res = nullptr; + if (ParallelReduction) { + llvm::Value *Args[] = {ThreadId, + CGF.Builder.getInt32(RHSExprs.size()), + ReductionArrayTySize, + RL, + ShuffleAndReduceFn, + InterWarpCopyFn}; + + Res = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait), + Args); + } + + // 5. Build switch(res) + auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); + auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); + + // 6. Build case 1: where we have reduced values in the master + // thread in each team. + // __kmpc_end_reduce{_nowait}(); + // break; + auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); + SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); + CGF.EmitBlock(Case1BB); + + // Add emission of __kmpc_end_reduce{_nowait}(); + llvm::Value *EndArgs[] = {ThreadId}; + auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps, + this](CodeGenFunction &CGF, PrePostActionTy &Action) { + auto IPriv = Privates.begin(); + auto ILHS = LHSExprs.begin(); + auto IRHS = RHSExprs.begin(); + for (auto *E : ReductionOps) { + emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), + cast(*IRHS)); + ++IPriv; + ++ILHS; + ++IRHS; + } + }; + RegionCodeGenTy RCG(CodeGen); + NVPTXActionTy Action( + nullptr, llvm::None, + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), + EndArgs); + RCG.setAction(Action); + RCG(CGF); + CGF.EmitBranch(DefaultBB); + CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); +} Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -1190,7 +1190,7 @@ } void CodeGenFunction::EmitOMPReductionClauseFinal( - const OMPExecutableDirective &D) { + const OMPExecutableDirective &D, const OpenMPDirectiveKind ReductionKind) { if (!HaveInsertPoint()) return; llvm::SmallVector Privates; @@ -1206,14 +1206,15 @@ ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); } if (HasAtLeastOneReduction) { + bool WithNowait = D.getSingleClause() || + isOpenMPParallelDirective(D.getDirectiveKind()) || + D.getDirectiveKind() == OMPD_simd; + bool SimpleReduction = D.getDirectiveKind() == OMPD_simd; // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). CGM.getOpenMPRuntime().emitReduction( *this, D.getLocEnd(), Privates, LHSExprs, RHSExprs, ReductionOps, - D.getSingleClause() || - isOpenMPParallelDirective(D.getDirectiveKind()) || - D.getDirectiveKind() == OMPD_simd, - D.getDirectiveKind() == OMPD_simd); + {WithNowait, SimpleReduction, ReductionKind}); } } @@ -1295,7 +1296,7 @@ CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); }; emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen); emitPostUpdateForReductionClause( @@ -1708,7 +1709,7 @@ // Emit final copy of the lastprivate variables at the end of loops. if (HasLastprivateClause) CGF.EmitOMPLastprivateClauseFinal(S, /*NoFinals=*/true); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_simd); emitPostUpdateForReductionClause( CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } @@ -2244,7 +2245,10 @@ CGF.EmitLoadOfScalar(IL, S.getLocStart())); }); } - EmitOMPReductionClauseFinal(S); + EmitOMPReductionClauseFinal( + S, /*ReductionKind=*/isOpenMPSimdDirective(S.getDirectiveKind()) + ? /*Parallel and Simd*/ OMPD_parallel_for_simd + : /*Parallel only*/ OMPD_parallel); // Emit post-update of the reduction variables if IsLastIter != 0. emitPostUpdateForReductionClause( *this, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -2419,7 +2423,7 @@ CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocEnd()); }; CGF.OMPCancelStack.emitExit(CGF, S.getDirectiveKind(), CodeGen); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); // Emit post-update of the reduction variables if IsLastIter != 0. emitPostUpdateForReductionClause( CGF, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -3817,11 +3821,19 @@ // Get the captured statement associated with the 'parallel' region. auto *CS = S.getCapturedStmt(OMPD_parallel); Action.Enter(CGF); - auto &&CodeGen = [CS](CodeGenFunction &CGF, PrePostActionTy &) { + auto &&CodeGen = [&S, CS](CodeGenFunction &CGF, PrePostActionTy &) { + CodeGenFunction::OMPPrivateScope PrivateScope(CGF); + (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); + CGF.EmitOMPPrivateClause(S, PrivateScope); + CGF.EmitOMPReductionClauseInit(S, PrivateScope); + (void)PrivateScope.Privatize(); // TODO: Add support for clauses. CGF.EmitStmt(CS->getCapturedStmt()); + CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel); }; emitCommonOMPParallelDirective(CGF, S, OMPD_parallel, CodeGen); + emitPostUpdateForReductionClause( + CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } void CodeGenFunction::EmitOMPTargetParallelDeviceFunction( Index: lib/CodeGen/CodeGenFunction.h =================================================================== --- lib/CodeGen/CodeGenFunction.h +++ lib/CodeGen/CodeGenFunction.h @@ -2638,7 +2638,9 @@ /// the end of the directive. /// /// \param D Directive that has at least one 'reduction' directives. - void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D); + /// \param ReductionKind The kind of reduction to perform. + void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D, + const OpenMPDirectiveKind ReductionKind); /// \brief Emit initial code for linear variables. Creates private copies /// and initializes them with the values according to OpenMP standard. /// Index: test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -0,0 +1,830 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] + +// Check that the execution mode of all 3 target regions is set to Spmd Mode. +// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0 + +template +tx ftemplate(int n) { + int a; + short b; + tx c; + float d; + double e; + + #pragma omp target parallel reduction(+: e) map(tofrom: e) + { + e += 5; + } + + #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d) + { + c ^= 2; + d *= 33; + } + + #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b) + { + a |= 1; + b = 99 > b ? 99 : b; + } + + return a+b+c+d+e; +} + +int bar(int n){ + int a = 0; + + a += ftemplate(n); + + return a; +} + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN]]( + // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 + // CHECK: store double [[ADD]], double* [[E]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] + // CHECK: store double [[ADD]], double* [[E_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double + // + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + + + + + + + + + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN1:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN1]]( + // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align + // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align + // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} + // CHECK: store float [[MUL]], float* [[D]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: store i8* [[C]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align + // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 + // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align + // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align + // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] + // CHECK: store float [[MUL]], float* [[D_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float + // + // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + + + + + + + + + + // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( + // + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXECUTE:.+]] + // + // CHECK: [[EXECUTE]] + // CHECK: {{call|invoke}} void [[PFN2:@.+]](i32* + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // + // + // define internal void [[PFN2]]( + // CHECK: store i32 0, i32* [[A:%.+]], align + // CHECK: store i16 -32768, i16* [[B:%.+]], align + // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align + // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 + // CHECK: store i32 [[OR]], i32* [[A]], align + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align + // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 + // CHECK: store i16 [[TRUNC]], i16* [[B]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* + // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* + // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) + // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [ + // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]] + + // CHECK: [[REDLABEL]] + // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align + // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align + // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] + // CHECK: store i32 [[OR]], i32* [[A_IN]], align + // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align + // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[DEFAULTLABEL]] + // + // CHECK: [[DEFAULTLABEL]] + // CHECK: ret + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + +#endif