Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -517,7 +517,24 @@
                                    Address Shareds, const OMPTaskDataTy &Data);
 
 public:
-  explicit CGOpenMPRuntime(CodeGenModule &CGM);
+  /// Discriminator used by the LLVM-style RTTI helpers (isa/cast/dyn_cast) to
+  /// tell the host runtime apart from its target specializations.
+  enum OpenMPRuntimeKind : unsigned { RK_HOST, RK_NVPTX };
+
+  /// Returns the kind this runtime object was constructed with.
+  OpenMPRuntimeKind getKind() const { return Kind; }
+
+  /// Every runtime object, whatever its kind, is a CGOpenMPRuntime, so the
+  /// root of the hierarchy accepts all of them.
+  static bool classof(const CGOpenMPRuntime *) { return true; }
+
+private:
+  /// Kind of this runtime object; fixed at construction.
+  const OpenMPRuntimeKind Kind;
+
+public:
+  explicit CGOpenMPRuntime(CodeGenModule &CGM,
+                           OpenMPRuntimeKind Kind = RK_HOST);
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
@@ -893,6 +910,27 @@
                                 OpenMPDirectiveKind InnermostKind,
                                 const RegionCodeGenTy &CodeGen,
                                 bool HasCancel = false);
+
+  /// Emits reduction function.
+  /// \param ArgsType Array type containing pointers to reduction variables.
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  llvm::Value *emitReductionFunction(CodeGenModule &CGM, llvm::Type *ArgsType,
+                                     ArrayRef<const Expr *> Privates,
+                                     ArrayRef<const Expr *> LHSExprs,
+                                     ArrayRef<const Expr *> RHSExprs,
+                                     ArrayRef<const Expr *> ReductionOps);
+
+  /// Emits single reduction combiner
+  void emitSingleReductionCombiner(CodeGenFunction &CGF,
+                                   const Expr *ReductionOp,
+                                   const Expr *PrivateRef,
+                                   const DeclRefExpr *LHS,
+                                   const DeclRefExpr *RHS);
+
   /// \brief Emit a code for reduction clause. Next code should be emitted for
   /// reduction:
   /// \code
@@ -931,12 +969,14 @@
   /// or 'operator binop(LHS, RHS)'.
   /// \param WithNowait true if parent directive has also nowait clause, false
   /// otherwise.
+  /// \param ReductionKind The kind of reduction to perform.
   virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
                              ArrayRef<const Expr *> Privates,
                              ArrayRef<const Expr *> LHSExprs,
                              ArrayRef<const Expr *> RHSExprs,
                              ArrayRef<const Expr *> ReductionOps,
-                             bool WithNowait, bool SimpleReduction);
+                             bool WithNowait, bool SimpleReduction,
+                             OpenMPDirectiveKind ReductionKind);
 
   /// \brief Emit code for 'taskwait' directive.
   virtual void emitTaskwaitCall(CodeGenFunction &CGF, SourceLocation Loc);
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -723,8 +723,8 @@
                       AlignmentSource::Decl);
 }
 
-CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
-    : CGM(CGM), OffloadEntriesInfoManager(CGM) {
+CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM, OpenMPRuntimeKind Kind)
+    : CGM(CGM), OffloadEntriesInfoManager(CGM), Kind(Kind) {
   IdentTy = llvm::StructType::create(
       "ident_t", CGM.Int32Ty /* reserved_1 */, CGM.Int32Ty /* flags */,
       CGM.Int32Ty /* reserved_2 */, CGM.Int32Ty /* reserved_3 */,
@@ -4257,12 +4257,10 @@
     CGF.EmitIgnoredExpr(ReductionOp);
 }
 
-static llvm::Value *emitReductionFunction(CodeGenModule &CGM,
-                                          llvm::Type *ArgsType,
-                                          ArrayRef<const Expr *> Privates,
-                                          ArrayRef<const Expr *> LHSExprs,
-                                          ArrayRef<const Expr *> RHSExprs,
-                                          ArrayRef<const Expr *> ReductionOps) {
+llvm::Value *CGOpenMPRuntime::emitReductionFunction(
+    CodeGenModule &CGM, llvm::Type *ArgsType, ArrayRef<const Expr *> Privates,
+    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+    ArrayRef<const Expr *> ReductionOps) {
   auto &C = CGM.getContext();
 
   // void reduction_func(void *LHSArg, void *RHSArg);
@@ -4279,6 +4277,9 @@
       ".omp.reduction.reduction_func", &CGM.getModule());
   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
   CodeGenFunction CGF(CGM);
+  // We don't need debug information in this function as nothing here refers to
+  // user source code.
+  CGF.disableDebugInfo();
   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
 
   // Dst = (void*[n])(LHSArg);
@@ -4345,11 +4346,11 @@
   return Fn;
 }
 
-static void emitSingleReductionCombiner(CodeGenFunction &CGF,
-                                        const Expr *ReductionOp,
-                                        const Expr *PrivateRef,
-                                        const DeclRefExpr *LHS,
-                                        const DeclRefExpr *RHS) {
+void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
+                                                  const Expr *ReductionOp,
+                                                  const Expr *PrivateRef,
+                                                  const DeclRefExpr *LHS,
+                                                  const DeclRefExpr *RHS) {
   if (PrivateRef->getType()->isArrayType()) {
     // Emit reduction for array section.
     auto *LHSVar = cast<VarDecl>(LHS->getDecl());
@@ -4369,7 +4370,8 @@
                                     ArrayRef<const Expr *> LHSExprs,
                                     ArrayRef<const Expr *> RHSExprs,
                                     ArrayRef<const Expr *> ReductionOps,
-                                    bool WithNowait, bool SimpleReduction) {
+                                    bool WithNowait, bool SimpleReduction,
+                                    OpenMPDirectiveKind ReductionKind) {
   if (!CGF.HaveInsertPoint())
     return;
   // Next code should be emitted for reduction:
@@ -4513,12 +4515,13 @@
     };
     auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps](
         CodeGenFunction &CGF, PrePostActionTy &Action) {
+      auto &RT = CGF.CGM.getOpenMPRuntime();
       auto IPriv = Privates.begin();
       auto ILHS = LHSExprs.begin();
       auto IRHS = RHSExprs.begin();
       for (auto *E : ReductionOps) {
-        emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
-                                    cast<DeclRefExpr>(*IRHS));
+        RT.emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
+                                       cast<DeclRefExpr>(*IRHS));
         ++IPriv;
         ++ILHS;
         ++IRHS;
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -67,12 +67,6 @@
   /// \brief Signal termination of Spmd mode execution.
   void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
-  /// \brief Returns specified OpenMP runtime function for the current OpenMP
-  /// implementation. Specialized for the NVPTX device.
-  /// \param Function OpenMP runtime function.
-  /// \return Specified function.
-  llvm::Constant *createNVPTXRuntimeFunction(unsigned Function);
-
   //
   // Base class overrides.
   //
@@ -170,6 +164,11 @@
 public:
   explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
 
+  /// LLVM-style RTTI support: true iff \p RT was constructed as RK_NVPTX.
+  static bool classof(const CGOpenMPRuntime *RT) {
+    return RT->getKind() == RK_NVPTX;
+  }
+
   /// \brief Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
   /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
   virtual void emitProcBindClause(CodeGenFunction &CGF,
@@ -248,7 +247,30 @@
                         ArrayRef<llvm::Value *> CapturedVars,
                         const Expr *IfCond) override;
 
-public:
+  /// \brief Emit a code for reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param WithNowait true if parent directive has also nowait clause, false
+  /// otherwise.
+  /// \param ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             bool WithNowait, bool SimpleReduction,
+                             OpenMPDirectiveKind ReductionKind) override;
+
+  /// \brief Returns specified OpenMP runtime function for the current OpenMP
+  /// implementation. Specialized for the NVPTX device.
+  /// \param Function OpenMP runtime function.
+  /// \return Specified function.
+  llvm::Constant *createNVPTXRuntimeFunction(unsigned Function);
+
   /// Target codegen is specialized based on two programming models: the
   /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
   /// model for constructs like 'target parallel' that support it.
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -44,6 +44,24 @@
   /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
   /// global_tid);
   OMPRTL_NVPTX__kmpc_end_serialized_parallel,
+  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
+  /// int16_t lane_offset, int16_t warp_size);
+  OMPRTL_NVPTX__kmpc_shuffle_int32,
+  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
+  /// int16_t lane_offset, int16_t warp_size);
+  OMPRTL_NVPTX__kmpc_shuffle_int64,
+  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
+  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
+  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
+  /// lane_offset, int16_t shortCircuit),
+  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num),
+  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
+  /// int32_t index, int32_t width),
+  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
+  /// index, int32_t width, int32_t reduce))
+  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
+  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
+  OMPRTL_NVPTX__kmpc_end_reduce_nowait
 };
 
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -100,6 +118,30 @@
   }
   ~ExecutionModeRAII() { Mode = SavedMode; }
 };
+
+// GPU Configuration: This information can be derived from cuda registers,
+// however, providing compile time constants helps generate more efficient
+// code. For all practical purposes this is fine because the configuration
+// is the same for all known NVPTX architectures.
+enum MachineConfiguration : unsigned {
+  WarpSize = 32,
+  // Number of bits required to represent a lane identifier, which is
+  // computed as log_2(WarpSize).
+  LaneIDBits = 5,
+  LaneIDMask = WarpSize - 1,
+
+  // Global memory alignment for performance.
+  GlobalMemoryAlignment = 256,
+};
+static_assert(WarpSize == (1u << LaneIDBits),
+              "LaneIDBits must equal log2(WarpSize)");
+
+enum NamedBarrier : unsigned {
+  // Synchronize on this barrier #ID using a named barrier primitive.
+  // Only the subset of active threads in a parallel region arrive at the
+  // barrier.
+  NB_Parallel = 1,
+};
 } // anonymous namespace
 
 /// Get the GPU warp size.
@@ -120,6 +162,23 @@
       llvm::None, "nvptx_tid");
 }
 
+/// Get the id of the warp in the block.
+/// We assume that the warp size is 32, which is always the case
+/// on the NVPTX device, to generate more efficient code.
+static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
+}
+
+/// Get the id of the current lane in the Warp.
+/// We assume that the warp size is 32, which is always the case
+/// on the NVPTX device, to generate more efficient code.
+static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
+                       "nvptx_lane_id");
+}
+
 /// Get the maximum number of threads in a block of the GPU.
 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
   CGBuilderTy &Bld = CGF.Builder;
@@ -136,9 +195,25 @@
       &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
 }
 
+/// Emit a call to the nvvm_barrier intrinsic with barrier #ID so that the
+/// selected (multiple of warp size) threads in a CTA synchronize.
+static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
+                            llvm::Value *NumThreads) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
+  Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(),
+                                                 llvm::Intrinsic::nvvm_barrier),
+                 Args);
+}
+
 /// Synchronize all GPU threads in a block.
 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
 
+/// Synchronize worker threads in a parallel region.
+static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
+  getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
+}
+
 /// Get the value of the thread_limit clause in the teams directive.
 /// For the 'generic' execution mode, the runtime encodes thread_limit in
 /// the launch parameters, always starting thread_limit+warpSize threads per
@@ -583,6 +658,76 @@
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
     break;
   }
+  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
+    /// Build int32_t __kmpc_shuffle_int32(int32_t element,
+    /// int16_t lane_offset, int16_t warp_size);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
+    /// Build int64_t __kmpc_shuffle_int64(int64_t element,
+    /// int16_t lane_offset, int16_t warp_size);
+    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
+    /// Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
+    /// int32_t num_vars, size_t reduce_size, void *reduce_data,
+    /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
+    /// lane_offset, int16_t shortCircuit),
+    /// void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num),
+    /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
+    /// int32_t index, int32_t width),
+    /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
+    /// int32_t index, int32_t width, int32_t reduce))
+    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
+                                             CGM.Int16Ty, CGM.Int16Ty};
+    auto *ShuffleReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
+    auto *InterWarpCopyFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
+                                                CGM.Int32Ty, CGM.Int32Ty};
+    auto *CopyToScratchpadFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *LoadReduceTypeParams[] = {
+        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
+    auto *LoadReduceFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
+                                /*isVarArg=*/false);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty,
+                                CGM.Int32Ty,
+                                CGM.SizeTy,
+                                CGM.VoidPtrTy,
+                                ShuffleReduceFnTy->getPointerTo(),
+                                InterWarpCopyFnTy->getPointerTo(),
+                                CopyToScratchpadFnTy->getPointerTo(),
+                                LoadReduceFnTy->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
+    break;
+  }
+  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
+    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
+    llvm::Type *TypeParams[] = {CGM.Int32Ty};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
+    RTLFn = CGM.CreateRuntimeFunction(
+        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
+    break;
+  }
   }
   return RTLFn;
 }
@@ -638,7 +783,8 @@
 }
 
 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
-    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
+    : CGOpenMPRuntime(CGM, RK_NVPTX),
+      CurrentExecutionMode(ExecutionMode::Unknown) {
   if (!CGM.getLangOpts().OpenMPIsDevice)
     llvm_unreachable("OpenMP NVPTX can only handle device code.");
 }
@@ -805,3 +951,1243 @@
   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
   CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
 }
+
+/// This function creates calls to one of two shuffle functions to copy
+/// variables between lanes in a warp.
+/// \param ElemTy Type of the element to shuffle; at most 8 bytes wide.
+/// \param Elem Value held by the current lane.
+/// \param Offset Lane offset forwarded to the runtime shuffle routine.
+/// \return The shuffled value, converted back to the memory type of \p ElemTy.
+static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
+                                                 QualType ElemTy,
+                                                 llvm::Value *Elem,
+                                                 llvm::Value *Offset) {
+  auto &CGM = CGF.CGM;
+  CGOpenMPRuntimeNVPTX &RT = cast<CGOpenMPRuntimeNVPTX>(CGM.getOpenMPRuntime());
+  auto &Bld = CGF.Builder;
+
+  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
+  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");
+
+  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
+                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
+                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;
+
+  // Cast all types to 32- or 64-bit values before calling shuffle routines.
+  // NOTE(review): the sign-extension path presumes that elements narrower than
+  // the shuffle width are integers - confirm no narrow FP type reaches here.
+  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
+  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
+  // Named WarpSizeVal so it does not shadow MachineConfiguration::WarpSize.
+  auto *WarpSizeVal =
+      Bld.CreateTruncOrBitCast(getNVPTXWarpSize(CGF), CGM.Int16Ty);
+
+  llvm::Value *Args[] = {ElemCast, Offset, WarpSizeVal};
+  auto *ShuffledVal =
+      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn), Args);
+
+  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
+}
+
+namespace {
+enum CopyAction : unsigned {
+  ScratchpadToThread,
+  ThreadToScratchpad,
+  ThreadCopy,
+  RemoteLaneToThread,
+};
+} // namespace
+
+// Emit instructions to copy a Reduce list, which contains partially
+// aggregated values, in the specified direction.
+//
+// ScratchpadToThread: Copy from a scratchpad array in global memory
+//                     containing team-reduced data to a thread's stack.
+// ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
+// ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+//                     the warp using shuffle instructions.
+//
+// Privates supplies one expression per Reduce list element; only its type is
+// used here. For Reduce list operands SrcBase/DestBase address an array of
+// element pointers; for scratchpad operands they wrap the integer address of
+// the scratchpad location of the current element. RemoteLaneOffset is
+// required for RemoteLaneToThread, ScratchpadIndex for both scratchpad
+// directions, and ScratchpadWidth whenever a scratchpad copy handles more
+// than one element. ReductionArrayTy is currently unused.
+static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
+                                  QualType ReductionArrayTy,
+                                  ArrayRef<const Expr *> Privates,
+                                  Address SrcBase, Address DestBase,
+                                  llvm::Value *RemoteLaneOffset = nullptr,
+                                  llvm::Value *ScratchpadIndex = nullptr,
+                                  llvm::Value *ScratchpadWidth = nullptr) {
+
+  auto &CGM = CGF.CGM;
+  auto &C = CGM.getContext();
+  auto &Bld = CGF.Builder;
+
+  // Iterates, element-by-element, through the source Reduce list and
+  // make a copy.
+  unsigned Idx = 0;
+  unsigned Size = Privates.size();
+  for (auto &Private : Privates) {
+    Address SrcElementAddr = Address::invalid();
+    Address DestElementAddr = Address::invalid();
+    Address DestElementPtrAddr = Address::invalid();
+    // Should we shuffle in an element from a remote lane?
+    bool ShuffleInElement = false;
+    // Set to true to update the pointer in the dest Reduce list to a
+    // newly created element.
+    bool UpdateDestListPtr = false;
+    // Increment the src or dest pointer to the scratchpad, for each
+    // new element.
+    bool IncrScratchpadSrc = false;
+    bool IncrScratchpadDest = false;
+
+    switch (Action) {
+    case RemoteLaneToThread: {
+      assert(RemoteLaneOffset && "RemoteLaneToThread requires a lane offset.");
+
+      // Step 1.1: Get the address for the src element in the Reduce list.
+      Address SrcElementPtrAddr =
+          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
+          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      SrcElementAddr =
+          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+
+      // Step 1.2: Create a temporary to store the element in the destination
+      // Reduce list.
+      DestElementPtrAddr =
+          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+      DestElementAddr =
+          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
+      ShuffleInElement = true;
+      UpdateDestListPtr = true;
+      break;
+    }
+    case ThreadCopy: {
+      // Step 1.1: Get the address for the src element in the Reduce list.
+      Address SrcElementPtrAddr =
+          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
+          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      SrcElementAddr =
+          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+
+      // Step 1.2: Get the address for dest element. The destination
+      // element has already been created on the thread's stack.
+      DestElementPtrAddr =
+          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+      llvm::Value *DestElementPtr =
+          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
+                               C.VoidPtrTy, SourceLocation());
+      Address DestElemAddr =
+          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
+      DestElementAddr = Bld.CreateElementBitCast(
+          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
+      break;
+    }
+    case ThreadToScratchpad: {
+      assert(ScratchpadIndex && "Scratchpad copies require an index.");
+
+      // Step 1.1: Get the address for the src element in the Reduce list.
+      Address SrcElementPtrAddr =
+          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
+      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
+          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+      SrcElementAddr =
+          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
+
+      // Step 1.2: Get the address for dest element:
+      // address = base + index * ElementSizeInChars.
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      auto *CurrentOffset =
+          Bld.CreateMul(Bld.getInt64(ElementSizeInChars), ScratchpadIndex);
+      auto *ScratchPadElemAbsolutePtrVal =
+          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
+      ScratchPadElemAbsolutePtrVal =
+          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
+      Address ScratchpadPtr =
+          Address(ScratchPadElemAbsolutePtrVal,
+                  C.getTypeAlignInChars(Private->getType()));
+      DestElementAddr = Bld.CreateElementBitCast(
+          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
+      IncrScratchpadDest = true;
+      break;
+    }
+    case ScratchpadToThread: {
+      assert(ScratchpadIndex && "Scratchpad copies require an index.");
+
+      // Step 1.1: Get the address for the src element in the scratchpad.
+      // address = base + index * ElementSizeInChars.
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      auto *CurrentOffset =
+          Bld.CreateMul(Bld.getInt64(ElementSizeInChars), ScratchpadIndex);
+      auto *ScratchPadElemAbsolutePtrVal =
+          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
+      ScratchPadElemAbsolutePtrVal =
+          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
+      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
+                               C.getTypeAlignInChars(Private->getType()));
+      IncrScratchpadSrc = true;
+
+      // Step 1.2: Create a temporary to store the element in the destination
+      // Reduce list.
+      DestElementPtrAddr =
+          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
+      DestElementAddr =
+          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
+      UpdateDestListPtr = true;
+      break;
+    }
+    }
+
+    // Regardless of src and dest of copy, we emit the load of src
+    // element as this is required in all directions
+    SrcElementAddr = Bld.CreateElementBitCast(
+        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
+    llvm::Value *Elem =
+        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
+                             Private->getType(), SourceLocation());
+
+    // Now that all active lanes have read the element in the
+    // Reduce list, shuffle over the value from the remote lane.
+    if (ShuffleInElement)
+      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
+                                          RemoteLaneOffset);
+
+    // Store the source element value to the dest element address.
+    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
+                          Private->getType());
+
+    // Step 3.1: Modify reference in dest Reduce list as needed.
+    // Modifying the reference in Reduce list to point to the newly
+    // created element. The element is live in the current function
+    // scope and that of functions it invokes (i.e., reduce_function).
+    // RemoteReduceData[i] = (void*)&RemoteElem
+    if (UpdateDestListPtr)
+      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
+                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
+                            DestElementPtrAddr, /*Volatile=*/false,
+                            C.VoidPtrTy);
+
+    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
+    // address of the next element in scratchpad memory, unless we're currently
+    // processing the last one. Memory alignment is also taken care of here.
+    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
+      assert(ScratchpadWidth && "Scratchpad copies require a row width.");
+      llvm::Value *ScratchpadBasePtr =
+          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
+      unsigned ElementSizeInChars =
+          C.getTypeSizeInChars(Private->getType()).getQuantity();
+      ScratchpadBasePtr = Bld.CreateAdd(
+          ScratchpadBasePtr,
+          Bld.CreateMul(ScratchpadWidth, Bld.getInt64(ElementSizeInChars)));
+
+      // Take care of global memory alignment for performance: round the
+      // address up to the next multiple of GlobalMemoryAlignment. The value is
+      // an address, so the division must be unsigned.
+      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr, Bld.getInt64(1));
+      ScratchpadBasePtr = Bld.CreateUDiv(ScratchpadBasePtr,
+                                         Bld.getInt64(GlobalMemoryAlignment));
+      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr, Bld.getInt64(1));
+      ScratchpadBasePtr =
+          Bld.CreateMul(ScratchpadBasePtr, Bld.getInt64(GlobalMemoryAlignment));
+
+      if (IncrScratchpadDest)
+        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
+      else /* IncrScratchpadSrc = true */
+        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
+    }
+
+    Idx++;
+  }
+}
+
+// This function emits a helper that loads data from the scratchpad array
+// and (optionally) reduces it with the input operand.
+//
+//  load_and_reduce(local, scratchpad, index, width, should_reduce)
+//  reduce_data remote;
+//  for elem in remote:
+//    remote.elem = Scratchpad[elem_id][index]
+//  if (should_reduce)
+//    local = local @ remote
+//  else
+//    local = remote
+//
+// ReduceFn is the function called as reduce_function(local, remote) on the
+// should_reduce path. Returns the emitted helper, which has internal linkage.
+static llvm::Value *
+emitReduceScratchpadFunction(CodeGenModule &CGM,
+                             ArrayRef<const Expr *> Privates,
+                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
+  auto &C = CGM.getContext();
+
+  // Destination of the copy.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
+                                  /*Id=*/nullptr, C.VoidPtrTy);
+  // Base address of the scratchpad array, with each element storing a
+  // Reduce list per team.
+  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, SourceLocation(),
+                                  /*Id=*/nullptr, C.VoidPtrTy);
+  // A source index into the scratchpad array.
+  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, SourceLocation(),
+                             /*Id=*/nullptr, C.IntTy);
+  // Row width of an element in the scratchpad array, typically
+  // the number of teams.
+  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, SourceLocation(),
+                             /*Id=*/nullptr, C.IntTy);
+  // If should_reduce == 1, then it's load AND reduce,
+  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
+  // The latter case is used for initialization.
+  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, SourceLocation(),
+                                    /*Id=*/nullptr, C.IntTy);
+
+  FunctionArgList Args;
+  Args.push_back(&ReduceListArg);
+  Args.push_back(&ScratchPadArg);
+  Args.push_back(&IndexArg);
+  Args.push_back(&WidthArg);
+  Args.push_back(&ShouldReduceArg);
+
+  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_load_and_reduce", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
+  CodeGenFunction CGF(CGM);
+  // We don't need debug information in this function as nothing here refers to
+  // user code.
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+
+  auto &Bld = CGF.Builder;
+
+  // Get local Reduce list pointer.
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address ReduceListAddr(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, SourceLocation()),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+
+  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
+  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+
+  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
+  llvm::Value *IndexVal =
+      Bld.CreateSExt(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
+                                          C.IntTy, SourceLocation()),
+                     CGF.Int64Ty);
+
+  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
+  llvm::Value *WidthVal =
+      Bld.CreateSExt(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
+                                          C.IntTy, SourceLocation()),
+                     CGF.Int64Ty);
+
+  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
+  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
+      AddrShouldReduceArg, /*Volatile=*/false, C.IntTy, SourceLocation());
+
+  // The absolute ptr address to the base addr of the next element to copy.
+  llvm::Value *CumulativeElemBasePtr =
+      Bld.CreatePtrToInt(ScratchPadBase, CGM.Int64Ty);
+  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
+
+  // Create a Remote Reduce list to store the elements read from the
+  // scratchpad array.
+  Address RemoteReduceList =
+      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
+
+  // Assemble remote Reduce list from scratchpad array.
+  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
+                        SrcDataAddr, RemoteReduceList,
+                        /*RemoteLaneOffset=*/nullptr, IndexVal, WidthVal);
+
+  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
+  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
+  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
+
+  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
+  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
+
+  CGF.EmitBlock(ThenBB);
+  // We should reduce with the local Reduce list.
+  // reduce_function(LocalReduceList, RemoteReduceList)
+  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
+  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
+  llvm::Value *FnArgs[] = {LocalDataPtr, RemoteDataPtr};
+  CGF.EmitCallOrInvoke(ReduceFn, FnArgs);
+  Bld.CreateBr(MergeBB);
+
+  CGF.EmitBlock(ElseBB);
+  // No reduction; just copy:
+  // Local Reduce list = Remote Reduce list.
+  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
+                        RemoteReduceList, ReduceListAddr);
+  Bld.CreateBr(MergeBB);
+
+  CGF.EmitBlock(MergeBB);
+
+  CGF.FinishFunction();
+  return Fn;
+}
+
+// This function emits a helper that stores reduced data from the team
+// master to a scratchpad array in global memory.
+//
+//  for elem in Reduce List:
+//    scratchpad[elem_id][index] = elem
+//
+// Returns the emitted helper, which has internal linkage.
+static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
+                                         ArrayRef<const Expr *> Privates,
+                                         QualType ReductionArrayTy) {
+
+  auto &C = CGM.getContext();
+
+  // Source of the copy.
+  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
+                                  /*Id=*/nullptr, C.VoidPtrTy);
+  // Base address of the scratchpad array, with each element storing a
+  // Reduce list per team.
+  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, SourceLocation(),
+                                  /*Id=*/nullptr, C.VoidPtrTy);
+  // A destination index into the scratchpad array, typically the team
+  // identifier.
+  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, SourceLocation(),
+                             /*Id=*/nullptr, C.IntTy);
+  // Row width of an element in the scratchpad array, typically
+  // the number of teams.
+  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, SourceLocation(),
+                             /*Id=*/nullptr, C.IntTy);
+
+  FunctionArgList Args;
+  Args.push_back(&ReduceListArg);
+  Args.push_back(&ScratchPadArg);
+  Args.push_back(&IndexArg);
+  Args.push_back(&WidthArg);
+
+  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
+  auto *Fn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
+  CodeGenFunction CGF(CGM);
+  // We don't need debug information in this function as nothing here refers to
+  // user code.
+  CGF.disableDebugInfo();
+  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
+
+  auto &Bld = CGF.Builder;
+
+  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
+  Address SrcDataAddr(
+      Bld.CreatePointerBitCastOrAddrSpaceCast(
+          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
+                               C.VoidPtrTy, SourceLocation()),
+          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
+      CGF.getPointerAlign());
+
+  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
+  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
+      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
+
+  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
+  llvm::Value *IndexVal =
+      Bld.CreateSExt(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
+                                          C.IntTy, SourceLocation()),
+                     CGF.Int64Ty);
+
+  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
+  llvm::Value *WidthVal =
+      Bld.CreateSExt(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
+                                          C.IntTy, SourceLocation()),
+                     CGF.Int64Ty);
+
+  // The absolute ptr address to the base addr of the next element to copy.
+  llvm::Value *CumulativeElemBasePtr =
+      Bld.CreatePtrToInt(ScratchPadBase, CGM.Int64Ty);
+  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
+
+  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
+                        SrcDataAddr, DestDataAddr, /*RemoteLaneOffset=*/nullptr,
+                        IndexVal, WidthVal);
+
+  CGF.FinishFunction();
+  return Fn;
+}
+
+// This function emits a helper that gathers Reduce lists from the first
+// lane of every active warp to lanes in the first warp.
+// +// void inter_warp_copy_func(void* reduce_data, num_warps) +// shared smem[warp_size]; +// For all data entries D in reduce_data: +// If (I am the first lane in each warp) +// Copy my local D to smem[warp_id] +// sync +// if (I am the first warp) +// Copy smem[thread_id] to my local D +// sync +static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy) { + auto &C = CGM.getContext(); + auto &M = CGM.getModule(); + + // ReduceList: thread local Reduce list. + // At the stage of the computation when this function is called, partially + // aggregated values reside in the first lane of every active warp. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // NumWarps: number of warps active in the parallel region. This could + // be smaller than 32 (max warps in a CTA) for partial block reduction. + ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.IntTy); + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&NumWarpsArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_inter_warp_copy_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + // This array is used as a medium to transfer, one reduce element at a time, + // the data from the first lane of every warp to lanes in the first warp + // in order to perform the final step of a reduction in a parallel region + // (reduction across warps). 
The array is placed in NVPTX __shared__ memory + // for reduced latency, as well as to have a distinct copy for concurrently + // executing target regions. The array is declared with common linkage so + // as to be shared across compilation units. + const char *TransferMediumName = + "__openmp_nvptx_data_transfer_temporary_storage"; + llvm::GlobalVariable *TransferMedium = + M.getGlobalVariable(TransferMediumName); + if (!TransferMedium) { + auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize); + unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); + TransferMedium = new llvm::GlobalVariable( + M, Ty, + /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage, + llvm::Constant::getNullValue(Ty), TransferMediumName, + /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, + SharedAddressSpace); + } + + // Get the CUDA thread id of the current OpenMP thread on the GPU. + auto *ThreadID = getNVPTXThreadID(CGF); + // nvptx_lane_id = nvptx_id % warpsize + auto *LaneID = getNVPTXLaneID(CGF); + // nvptx_warp_id = nvptx_id / warpsize + auto *WarpID = getNVPTXWarpID(CGF); + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + unsigned Idx = 0; + for (auto &Private : Privates) { + // + // Warp master copies reduce element to transfer medium in __shared__ + // memory. 
+ // + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); + + // if (lane_id == 0) + auto IsWarpMaster = + Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master"); + Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB); + CGF.EmitBlock(ThenBB); + + // Reduce element = LocalReduceList[i] + Address ElemPtrPtrAddr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar( + ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + // elemptr = (type[i]*)(elemptrptr) + Address ElemPtr = + Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType())); + ElemPtr = Bld.CreateElementBitCast( + ElemPtr, CGF.ConvertTypeForMem(Private->getType())); + // elem = *elemptr + llvm::Value *Elem = CGF.EmitLoadOfScalar( + ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); + + // Get pointer to location in transfer medium. + // MediumPtr = &medium[warp_id] + llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID}); + Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType())); + // Casting to actual data type. 
+ // MediumPtr = (type[i]*)MediumPtrAddr; + MediumPtr = Bld.CreateElementBitCast( + MediumPtr, CGF.ConvertTypeForMem(Private->getType())); + + //*MediumPtr = elem + Bld.CreateStore(Elem, MediumPtr); + + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg); + llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar( + AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation()); + + auto *NumActiveThreads = Bld.CreateNSWMul( + NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads"); + // named_barrier_sync(ParallelBarrierID, num_active_threads) + syncParallelThreads(CGF, NumActiveThreads); + + // + // Warp 0 copies reduce element from transfer medium. + // + llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont"); + + // Up to 32 threads in warp 0 are active. 
+ auto IsActiveThread = + Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread"); + Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB); + + CGF.EmitBlock(W0ThenBB); + + // SrcMediumPtr = &medium[tid] + llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP( + TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID}); + Address SrcMediumPtr(SrcMediumPtrVal, + C.getTypeAlignInChars(Private->getType())); + // SrcMediumVal = *SrcMediumPtr; + SrcMediumPtr = Bld.CreateElementBitCast( + SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType())); + llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar( + SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation()); + + // TargetElemPtr = (type[i]*)(SrcDataAddr[i]) + Address TargetElemPtrPtr = + Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize()); + llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar( + TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + Address TargetElemPtr = + Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType())); + TargetElemPtr = Bld.CreateElementBitCast( + TargetElemPtr, CGF.ConvertTypeForMem(Private->getType())); + + // *TargetElemPtr = SrcMediumVal; + CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false, + Private->getType()); + Bld.CreateBr(W0MergeBB); + + CGF.EmitBlock(W0ElseBB); + Bld.CreateBr(W0MergeBB); + + CGF.EmitBlock(W0MergeBB); + + // While warp 0 copies values from transfer medium, all other warps must + // wait. + syncParallelThreads(CGF, NumActiveThreads); + Idx++; + } + + CGF.FinishFunction(); + return Fn; +} + +// Emit a helper that reduces data across two OpenMP threads (lanes) +// in the same warp. It uses shuffle instructions to copy over data from +// a remote lane's stack. The reduction algorithm performed is specified +// by the fourth parameter. +// +// Algorithm Versions. 
+// Full Warp Reduce (argument value 0): +// This algorithm assumes that all 32 lanes are active and gathers +// data from these 32 lanes, producing a single resultant value. +// Contiguous Partial Warp Reduce (argument value 1): +// This algorithm assumes that only a *contiguous* subset of lanes +// are active. This happens for the last warp in a parallel region +// when the user specified num_threads is not an integer multiple of +// 32. This contiguous subset always starts with the zeroth lane. +// Partial Warp Reduce (argument value 2): +// This algorithm gathers data from any number of lanes at any position. +// All reduced values are stored in the lowest possible lane. The set +// of problems every algorithm addresses is a super set of those +// addressable by algorithms with a lower version number. Overhead +// increases as algorithm version increases. +// +// Terminology +// Reduce element: +// Reduce element refers to the individual data field with primitive +// data types to be combined and reduced across threads. +// Reduce list: +// Reduce list refers to a collection of local, thread-private +// reduce elements. +// Remote Reduce list: +// Remote Reduce list refers to a collection of remote (relative to +// the current thread) reduce elements. +// +// We distinguish between three states of threads that are important to +// the implementation of this function. +// Alive threads: +// Threads in a warp executing the SIMT instruction, as distinguished from +// threads that are inactive due to divergent control flow. +// Active threads: +// The minimal set of threads that has to be alive upon entry to this +// function. The computation is correct iff active threads are alive. +// Some threads are alive but they are not active because they do not +// contribute to the computation in any useful manner. Turning them off +// may introduce control flow overheads without any tangible benefits. 
+// Effective threads: +// In order to comply with the argument requirements of the shuffle +// function, we must keep all lanes holding data alive. But at most +// half of them perform value aggregation; we refer to this half of +// threads as effective. The other half is simply handing off their +// data. +// +// Procedure +// Value shuffle: +// In this step active threads transfer data from higher lane positions +// in the warp to lower lane positions, creating Remote Reduce list. +// Value aggregation: +// In this step, effective threads combine their thread local Reduce list +// with Remote Reduce list and store the result in the thread local +// Reduce list. +// Value copy: +// In this step, we deal with the assumption made by algorithm 2 +// (i.e. contiguity assumption). When we have an odd number of lanes +// active, say 2k+1, only k threads will be effective and therefore k +// new values will be produced. However, the Reduce list owned by the +// (2k+1)th thread is ignored in the value aggregation. Therefore +// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so +// that the contiguity assumption still holds. +static llvm::Value * +emitShuffleAndReduceFunction(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy, llvm::Value *ReduceFn) { + auto &C = CGM.getContext(); + + // Thread local Reduce list used to host the values of data to be reduced. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // Current lane id; could be logical. + ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + // Offset of the remote source lane relative to the current lane. + ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + // Algorithm version. This is expected to be known at compile time. 
+ ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.ShortTy); + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&LaneIDArg); + Args.push_back(&RemoteLaneOffsetArg); + Args.push_back(&AlgoVerArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address LocalReduceList( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg); + llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar( + AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg); + llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar( + AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg); + llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar( + AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation()); + + // Create a local thread-private variable to host the Reduce list + // from a remote lane. 
+ Address RemoteReduceList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list"); + + // This loop iterates through the list of reduce elements and copies, + // element by element, from a remote lane in the warp to RemoteReduceList, + // hosted on the thread's stack. + emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, + LocalReduceList, RemoteReduceList, + RemoteLaneOffsetArgVal); + + // The actions to be performed on the Remote Reduce list is dependent + // on the algorithm version. + // + // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 && + // LaneId % 2 == 0 && Offset > 0): + // do the reduction value aggregation + // + // The thread local variable Reduce list is mutated in place to host the + // reduced data, which is the aggregated value produced from local and + // remote lanes. + // + // Note that AlgoVer is expected to be a constant integer known at compile + // time. + // When AlgoVer==0, the first conjunction evaluates to true, making + // the entire predicate true during compile time. + // When AlgoVer==1, the second conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. + // When AlgoVer==2, the third conjunction has only the second part to be + // evaluated during runtime. Other conjunctions evaluates to false + // during compile time. 
+ auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0)); + + auto CondAlgo1 = + Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), + Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal)); + + auto CondAlgo2 = Bld.CreateAnd( + Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2)), + Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)), + Bld.getInt16(0))); + CondAlgo2 = Bld.CreateAnd( + CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0))); + + auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1); + CondReduce = Bld.CreateOr(CondReduce, CondAlgo2); + + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); + Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); + + CGF.EmitBlock(ThenBB); + // reduce_function(LocalReduceList, RemoteReduceList) + llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + LocalReduceList.getPointer(), CGF.VoidPtrTy); + llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + RemoteReduceList.getPointer(), CGF.VoidPtrTy); + llvm::SmallVector FnArgs; + FnArgs.push_back(LocalReduceListPtr); + FnArgs.push_back(RemoteReduceListPtr); + CGF.EmitCallOrInvoke(ReduceFn, FnArgs); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local + // Reduce list. 
+ auto CondCopy = + Bld.CreateAnd(Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1)), + Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal)); + + llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont"); + Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB); + + CGF.EmitBlock(CpyThenBB); + emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, + RemoteReduceList, LocalReduceList); + Bld.CreateBr(CpyMergeBB); + + CGF.EmitBlock(CpyElseBB); + Bld.CreateBr(CpyMergeBB); + + CGF.EmitBlock(CpyMergeBB); + + CGF.FinishFunction(); + return Fn; +} + +// +// Design of OpenMP reductions on the GPU +// +// Consider a typical OpenMP program with one or more reduction +// clauses: +// +// float foo; +// double bar; +// #pragma omp target teams distribute parallel for \ +// reduction(+:foo) reduction(*:bar) +// for (int i = 0; i < N; i++) { +// foo += A[i]; bar *= B[i]; +// } +// +// where 'foo' and 'bar' are reduced across all OpenMP threads in +// all teams. In our OpenMP implementation on the NVPTX device an +// OpenMP team is mapped to a CUDA threadblock and OpenMP threads +// within a team are mapped to CUDA threads within a threadblock. +// Our goal is to efficiently aggregate values across all OpenMP +// threads such that: +// +// - the compiler and runtime are logically concise, and +// - the reduction is performed efficiently in a hierarchical +// manner as follows: within OpenMP threads in the same warp, +// across warps in a threadblock, and finally across teams on +// the NVPTX device. +// +// Introduction to Decoupling +// +// We would like to decouple the compiler and the runtime so that the +// latter is ignorant of the reduction variables (number, data types) +// and the reduction operators. This allows a simpler interface +// and implementation while still attaining good performance. 
+// +// Pseudocode for the aforementioned OpenMP program generated by the +// compiler is as follows: +// +// 1. Create private copies of reduction variables on each OpenMP +// thread: 'foo_private', 'bar_private' +// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned +// to it and writes the result in 'foo_private' and 'bar_private' +// respectively. +// 3. Call the OpenMP runtime on the GPU to reduce within a team +// and store the result on the team master: +// +// __kmpc_nvptx_parallel_reduce_nowait(..., +// reduceData, shuffleReduceFn, interWarpCpyFn) +// +// where: +// struct ReduceData { +// float *foo; +// double *bar; +// } reduceData +// reduceData.foo = &foo_private +// reduceData.bar = &bar_private +// +// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two +// auxiliary functions generated by the compiler that operate on +// variables of type 'ReduceData'. They help the runtime perform +// algorithmic steps in a data agnostic manner. +// +// 'shuffleReduceFn' is a pointer to a function that reduces data +// of type 'ReduceData' across two OpenMP threads (lanes) in the +// same warp. It takes the following arguments as input: +// +// a. variable of type 'ReduceData' on the calling lane, +// b. its lane_id, +// c. an offset relative to the current lane_id to generate a +// remote_lane_id. The remote lane contains the second +// variable of type 'ReduceData' that is to be reduced. +// d. an algorithm version parameter determining which reduction +// algorithm to use. +// +// 'shuffleReduceFn' retrieves data from the remote lane using +// efficient GPU shuffle intrinsics and reduces, using the +// algorithm specified by the 4th parameter, the two operands +// element-wise. The result is written to the first operand. +// +// Different reduction algorithms are implemented in different +// runtime functions, all calling 'shuffleReduceFn' to perform +// the essential reduction step. 
Therefore, based on the 4th +// parameter, this function behaves slightly differently to +// cooperate with the runtime to ensure correctness under +// different circumstances. +// +// 'InterWarpCpyFn' is a pointer to a function that transfers +// reduced variables across warps. It tunnels, through CUDA +// shared memory, the thread-private data of type 'ReduceData' +// from lane 0 of each warp to a lane in the first warp. +// 4. Call the OpenMP runtime on the GPU to reduce across teams. +// The last team writes the global reduced value to memory. +// +// ret = __kmpc_nvptx_teams_reduce_nowait(..., +// reduceData, shuffleReduceFn, interWarpCpyFn, +// scratchpadCopyFn, loadAndReduceFn) +// +// 'scratchpadCopyFn' is a helper that stores reduced +// data from the team master to a scratchpad array in +// global memory. +// +// 'loadAndReduceFn' is a helper that loads data from +// the scratchpad array and reduces it with the input +// operand. +// +// These compiler generated functions hide address +// calculation and alignment information from the runtime. +// 5. if ret == 1: +// The team master of the last team stores the reduced +// result to the globals in memory. +// foo += reduceData.foo; bar *= reduceData.bar +// +// +// Warp Reduction Algorithms +// +// On the warp level, we have three algorithms implemented in the +// OpenMP runtime depending on the number of active lanes: +// +// Full Warp Reduction +// +// The reduce algorithm within a warp where all lanes are active +// is implemented in the runtime as follows: +// +// full_warp_reduce(void *reduce_data, +// kmp_ShuffleReductFctPtr ShuffleReduceFn) { +// for (int offset = WARPSIZE/2; offset > 0; offset /= 2) +// ShuffleReduceFn(reduce_data, 0, offset, 0); +// } +// +// The algorithm completes in log(2, WARPSIZE) steps. +// +// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is +// not used therefore we save instructions by not retrieving lane_id +// from the corresponding special registers. 
The 4th parameter, which +// represents the version of the algorithm being used, is set to 0 to +// signify full warp reduction. +// +// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +// +// #reduce_elem refers to an element in the local lane's data structure +// #remote_elem is retrieved from a remote lane +// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +// reduce_elem = reduce_elem REDUCE_OP remote_elem; +// +// Contiguous Partial Warp Reduction +// +// This reduce algorithm is used within a warp where only the first +// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the +// number of OpenMP threads in a parallel region is not a multiple of +// WARPSIZE. The algorithm is implemented in the runtime as follows: +// +// void +// contiguous_partial_reduce(void *reduce_data, +// kmp_ShuffleReductFctPtr ShuffleReduceFn, +// int size, int lane_id) { +// int curr_size; +// int offset; +// curr_size = size; +// offset = curr_size/2; +// while (offset>0) { +// ShuffleReduceFn(reduce_data, lane_id, offset, 1); +// curr_size = (curr_size+1)/2; +// offset = curr_size/2; +// } +// } +// +// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +// +// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +// if (lane_id < offset) +// reduce_elem = reduce_elem REDUCE_OP remote_elem +// else +// reduce_elem = remote_elem +// +// This algorithm assumes that the data to be reduced are located in a +// contiguous subset of lanes starting from the first. When there is +// an odd number of active lanes, the data in the last lane is not +// aggregated with any other lane's data but is instead copied over. +// +// Dispersed Partial Warp Reduction +// +// This algorithm is used within a warp when any discontiguous subset of +// lanes are active. It is used to implement the reduction operation +// across lanes in an OpenMP simd region or in a nested parallel region. 
+// +// void +// dispersed_partial_reduce(void *reduce_data, +// kmp_ShuffleReductFctPtr ShuffleReduceFn) { +// int size, remote_id; +// int logical_lane_id = number_of_active_lanes_before_me() * 2; +// do { +// remote_id = next_active_lane_id_right_after_me(); +// # the above function returns 0 if no active lane +// # is present right after the current lane. +// size = number_of_active_lanes_in_this_warp(); +// logical_lane_id /= 2; +// ShuffleReduceFn(reduce_data, logical_lane_id, +// remote_id-1-threadIdx.x, 2); +// } while (logical_lane_id % 2 == 0 && size > 1); +// } +// +// There is no assumption made about the initial state of the reduction. +// Any number of lanes (>=1) could be active at any position. The reduction +// result is returned in the first active lane. +// +// In this version, 'ShuffleReduceFn' behaves, per element, as follows: +// +// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE); +// if (lane_id % 2 == 0 && offset > 0) +// reduce_elem = reduce_elem REDUCE_OP remote_elem +// else +// reduce_elem = remote_elem +// +// +// Intra-Team Reduction +// +// This function, as implemented in the runtime call +// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP +// threads in a team. It first reduces within a warp using the +// aforementioned algorithms. We then proceed to gather all such +// reduced values at the first warp. +// +// The runtime makes use of the function 'InterWarpCpyFn', which copies +// data from each of the "warp master" (zeroth lane of each warp, where +// warp-reduced data is held) to the zeroth warp. This step reduces (in +// a mathematical sense) the problem of reduction across warp masters in +// a block to the problem of warp reduction. +// +// +// Inter-Team Reduction +// +// Once a team has reduced its data to a single value, it is stored in +// a global scratchpad array. Since each team has a distinct slot, this +// can be done without locking. 
+// +// The last team to write to the scratchpad array proceeds to reduce the +// scratchpad array. One or more workers in the last team use the helper +// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., +// the k'th worker reduces every k'th element. +// +// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to +// reduce across workers and compute a globally reduced value. +// +void CGOpenMPRuntimeNVPTX::emitReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps, bool WithNowait, bool SimpleReduction, + OpenMPDirectiveKind ReductionKind) { + if (!CGF.HaveInsertPoint()) + return; + + bool TeamsReduction = isOpenMPTeamsDirective(ReductionKind); + // FIXME: Add support for parallel and simd reduction. + assert(TeamsReduction && "Invalid reduction selection in emitReduction."); + + auto &C = CGM.getContext(); + + // 1. Build a list of reduction variables. + // void *RedList[] = {[0], ..., [-1]}; + auto Size = RHSExprs.size(); + for (auto *E : Privates) { + if (E->getType()->isVariablyModifiedType()) + // Reserve place for array size. + ++Size; + } + llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size); + QualType ReductionArrayTy = + C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal, + /*IndexTypeQuals=*/0); + Address ReductionList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list"); + auto IPriv = Privates.begin(); + unsigned Idx = 0; + for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) { + Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + CGF.Builder.CreateStore( + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + Elem); + if ((*IPriv)->getType()->isVariablyModifiedType()) { + // Store array size. 
+ ++Idx; + Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx, + CGF.getPointerSize()); + llvm::Value *Size = CGF.Builder.CreateIntCast( + CGF.getVLASize( + CGF.getContext().getAsVariableArrayType((*IPriv)->getType())) + .first, + CGF.SizeTy, /*isSigned=*/false); + CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy), + Elem); + } + } + + // 2. Emit reduce_func(). + auto *ReductionFn = emitReductionFunction( + CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates, + LHSExprs, RHSExprs, ReductionOps); + + // 4. Build res = __kmpc_reduce{_nowait}(, , sizeof(RedList), + // RedList, shuffle_reduce_func, interwarp_copy_func); + auto *ThreadId = getThreadID(CGF, Loc); + auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy); + auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + ReductionList.getPointer(), CGF.VoidPtrTy); + + auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction( + CGM, Privates, ReductionArrayTy, ReductionFn); + auto *InterWarpCopyFn = + emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy); + auto *ScratchPadCopyFn = + emitCopyToScratchpad(CGM, Privates, ReductionArrayTy); + auto *LoadAndReduceFn = emitReduceScratchpadFunction( + CGM, Privates, ReductionArrayTy, ReductionFn); + + llvm::Value *Res = nullptr; + if (TeamsReduction) { + llvm::Value *TeamsArgs[] = {ThreadId, + CGF.Builder.getInt32(RHSExprs.size()), + ReductionArrayTySize, + RL, + ShuffleAndReduceFn, + InterWarpCopyFn, + ScratchPadCopyFn, + LoadAndReduceFn}; + Res = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait), + TeamsArgs); + } + + // 5. Build switch(res) + auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); + auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); + + // 6. Build case 1: where we have reduced values in the master + // thread in each team. 
+ // Team reduction implementation is pending + // __kmpc_end_reduce{_nowait}(); + // break; + auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1"); + SwInst->addCase(CGF.Builder.getInt32(1), Case1BB); + CGF.EmitBlock(Case1BB); + + // Add emission of __kmpc_end_reduce{_nowait}(); + llvm::Value *EndArgs[] = {ThreadId}; + auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps, + this](CodeGenFunction &CGF, PrePostActionTy &Action) { + auto IPriv = Privates.begin(); + auto ILHS = LHSExprs.begin(); + auto IRHS = RHSExprs.begin(); + for (auto *E : ReductionOps) { + emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), + cast(*IRHS)); + ++IPriv; + ++ILHS; + ++IRHS; + } + }; + RegionCodeGenTy RCG(CodeGen); + NVPTXActionTy Action( + nullptr, llvm::None, + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait), + EndArgs); + RCG.setAction(Action); + RCG(CGF); + CGF.EmitBranch(DefaultBB); + CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); +} Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -1190,7 +1190,7 @@ } void CodeGenFunction::EmitOMPReductionClauseFinal( - const OMPExecutableDirective &D) { + const OMPExecutableDirective &D, const OpenMPDirectiveKind ReductionKind) { if (!HaveInsertPoint()) return; llvm::SmallVector Privates; @@ -1213,7 +1213,7 @@ D.getSingleClause() || isOpenMPParallelDirective(D.getDirectiveKind()) || D.getDirectiveKind() == OMPD_simd, - D.getDirectiveKind() == OMPD_simd); + D.getDirectiveKind() == OMPD_simd, ReductionKind); } } @@ -1295,7 +1295,7 @@ CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, OMPD_parallel); }; emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen); emitPostUpdateForReductionClause( @@ -1708,7 +1708,7 
@@ // Emit final copy of the lastprivate variables at the end of loops. if (HasLastprivateClause) CGF.EmitOMPLastprivateClauseFinal(S, /*NoFinals=*/true); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, OMPD_simd); emitPostUpdateForReductionClause( CGF, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } @@ -2244,7 +2244,9 @@ CGF.EmitLoadOfScalar(IL, S.getLocStart())); }); } - EmitOMPReductionClauseFinal(S); + EmitOMPReductionClauseFinal(S, isOpenMPSimdDirective(S.getDirectiveKind()) + ? OMPD_parallel_for_simd + : OMPD_parallel); // Emit post-update of the reduction variables if IsLastIter != 0. emitPostUpdateForReductionClause( *this, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -2419,7 +2421,7 @@ CGF.CGM.getOpenMPRuntime().emitForStaticFinish(CGF, S.getLocEnd()); }; CGF.OMPCancelStack.emitExit(CGF, S.getDirectiveKind(), CodeGen); - CGF.EmitOMPReductionClauseFinal(S); + CGF.EmitOMPReductionClauseFinal(S, OMPD_parallel); // Emit post-update of the reduction variables if IsLastIter != 0. emitPostUpdateForReductionClause( CGF, S, [&](CodeGenFunction &CGF) -> llvm::Value * { @@ -3549,10 +3551,14 @@ OMPPrivateScope PrivateScope(CGF); (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); CGF.EmitOMPPrivateClause(S, PrivateScope); + CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); + CGF.EmitOMPReductionClauseFinal(S, OMPD_teams); }; emitCommonOMPTeamsDirective(*this, S, OMPD_teams, CodeGen); + emitPostUpdateForReductionClause( + *this, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } static void emitTargetTeamsRegion(CodeGenFunction &CGF, PrePostActionTy &Action, Index: lib/CodeGen/CodeGenFunction.h =================================================================== --- lib/CodeGen/CodeGenFunction.h +++ lib/CodeGen/CodeGenFunction.h @@ -2638,7 +2638,9 @@ /// the end of the directive. 
/// /// \param D Directive that has at least one 'reduction' directives. - void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D); + /// \param ReductionKind The kind of reduction to perform. + void EmitOMPReductionClauseFinal(const OMPExecutableDirective &D, + const OpenMPDirectiveKind ReductionKind); /// \brief Emit initial code for linear variables. Creates private copies /// and initializes them with the values according to OpenMP standard. /// Index: test/OpenMP/nvptx_teams_reduction_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -0,0 +1,1131 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. 
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] + +// Check that the execution mode of all 3 target regions is set to Generic Mode. +// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 + +template<typename tx> +tx ftemplate(int n) { + int a; + short b; + tx c; + float d; + double e; + + #pragma omp target + #pragma omp teams reduction(+: e) + { + e += 5; + } + + #pragma omp target + #pragma omp teams reduction(^: c) reduction(*: d) + { + c ^= 2; + d *= 33; + } + + #pragma omp target + #pragma omp teams reduction(|: a) reduction(max: b) + { + a |= 1; + b = 99 > b ? 99 : b; + } + + return a+b+c+d+e; +} + +int bar(int n){ + int a = 0; + + a += ftemplate<char>(n); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker() + + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]]( + // + // CHECK: {{call|invoke}} void [[T1]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 + // CHECK: store double [[ADD]], double* [[E]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) +
// CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] + // CHECK: store double [[ADD]], double* [[E_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 
0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double + // + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call 
void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: 
[[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. 
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i64 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 
[[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[P:%.+]] = mul i64 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], 
label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker() + + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]]( + // + // CHECK: {{call|invoke}} void [[T2]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align + // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align + // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} + // CHECK: store float [[MUL]], float* [[D]], align + // 
CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: store i8* [[C]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align + // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 + // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align + // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align + // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] + // CHECK: store float [[MUL]], float* [[D_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: 
[[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], 
i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float + // + // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + 
// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* 
[[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br 
label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] 
addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void 
[[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i64 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i64 [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i64 [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i64 [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i64 [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i64 [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i64 [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i64 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align + // 
+ // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[P:%.+]] = mul i64 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i64 [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i64 [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i64 [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i64 [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i64 [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i64 [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i64 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, 
i{{32|64}} 1 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to 
float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker() + + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]]( + // + // CHECK: {{call|invoke}} void [[T3]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store i32 0, i32* [[A:%.+]], align + // CHECK: store i16 -32768, i16* [[B:%.+]], align + // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align + // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 + // CHECK: store i32 [[OR]], i32* [[A]], align + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align + // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 + // CHECK: store i16 [[TRUNC]], i16* [[B]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* + // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* + // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: 
[[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align + // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align + // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] + // CHECK: store i32 [[OR]], i32* [[A_IN]], align + // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align + // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to 
i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: 
define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 
[[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: 
[[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: ret void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + //
CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. 
+ // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK:
br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + 
// CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i64 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i64 [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i64 [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i64 [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i64 [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i64 [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i64 [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i64 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: 
[[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i64 + // + // CHECK: [[P:%.+]] = mul i64 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i64 [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i64 [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i64 [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i64 [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i64 [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i64 [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i64 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i64 [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i64 [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: 
[[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* 
[[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + +#endif