diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2568,7 +2568,11 @@ */ CXCursor_OMPScanDirective = 287, - CXCursor_LastStmt = CXCursor_OMPScanDirective, + /** OpenMP canonical loop. + */ + CXCursor_OMPCanonicalLoop = 288, + + CXCursor_LastStmt = CXCursor_OMPCanonicalLoop, /** * Cursor that represents the translation unit itself. diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2787,6 +2787,14 @@ return true; } +DEF_TRAVERSE_STMT(OMPCanonicalLoop, { + if (!getDerived().shouldVisitImplicitCode()) { + // Visit only the syntactical loop. + TRY_TO(TraverseStmt(S->getLoopStmt())); + ShouldVisitChildren = false; + } +}) + template bool RecursiveASTVisitor::TraverseOMPLoopDirective(OMPLoopDirective *S) { diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -28,6 +28,201 @@ // AST classes for directives. //===----------------------------------------------------------------------===// +/// Representation of an OpenMP canonical loop. 
+/// +/// OpenMP 1.0 C/C++, section 2.4.1 for Construct; canonical-shape +/// OpenMP 2.0 C/C++, section 2.4.1 for Construct; canonical-shape +/// OpenMP 2.5, section 2.5.1 Loop Construct; canonical form +/// OpenMP 3.1, section 2.5.1 Loop Construct; canonical form +/// OpenMP 4.0, section 2.6 Canonical Loop Form +/// OpenMP 4.5, section 2.6 Canonical Loop Form +/// OpenMP 5.0, section 2.9.1 Canonical Loop Form +/// OpenMP 5.1, section 2.11.1 Canonical Loop Nest Form +/// +/// An OpenMP canonical loop is a for-statement or range-based for-statement +/// with additional requirements that ensure that the number of iterations is +/// known before entering the loop and allow skipping to an arbitrary iteration. +/// The OMPCanonicalLoop AST node wraps a ForStmt or CXXRangeForStmt that is +/// known to fulfill OpenMP's canonical loop requirements. +/// +/// There are three different kinds of iteration variables for different +/// purposes: +/// * Loop user variable: The user-accessible variable with different value for +/// each iteration. +/// * Loop iteration variable: The variable used to identify a loop iteration; +/// for range-based for-statement, this is the hidden iterator '__begin'. For +/// other loops, it is identical to the loop variable. Must be a random-access +/// iterator, pointer or integer type. +/// * Logical iteration counter: Normalized loop counter starting at 0 and +/// incrementing by one at each iteration. Allows abstracting over the type +/// of the loop counter and is always an unsigned integer type appropriate to +/// represent the range of the loop counter variable. Its value corresponds to +/// the logical iteration number in the OpenMP specification. +/// +/// This AST node provides two captured statements: +/// * The distance function which computes the number of iterations. +/// * The loop user variable function that computes the loop user variable when +/// given a logical iteration number. 
+/// +/// These captured statements provide the link between C/C++ semantics and the +/// logical iteration counters used by the OpenMPIRBuilder which is +/// language-agnostic and therefore does not know e.g. how to advance a +/// random-access iterator. The OpenMPIRBuilder will use this information to +/// apply simd, workshare-loop, distribute, taskloop and loop directives to the +/// loop. For compatibility with the non-OpenMPIRBuilder codegen path, an +/// OMPCanonicalLoop can itself also be wrapped into the CapturedStmts of an +/// OMPLoopDirective and skipped when searching for the associated syntactical +/// loop. +/// +/// Example: +/// +/// std::vector Container{1,2,3}; +/// for (std::string Str : Container) +/// Body(Str); +/// +/// which is syntactic sugar for approximately: +/// +/// auto &&__range = Container; +/// auto __begin = std::begin(__range); +/// auto __end = std::end(__range); +/// for (; __begin != __end; ++__begin) { +/// std::String Str = *__begin; +/// Body(Str); +/// } +/// +/// In this example, the loop user variable is `Str`, the loop iteration +/// variable is `__begin` of type `std::vector::iterator` and the +/// logical iteration number type is `size_t` (unsigned version of +/// `std::vector::iterator::difference_type` aka `ptrdiff_t`). +/// Therefore, the distance function will be +/// [&](size_t &Result) { Result = __end - __begin; } +/// +/// and the loop variable function is +/// +/// [&,__begin](std::vector::iterator &Result, size_t Logical) { +/// Result = __begin + Logical; } +/// +class OMPCanonicalLoop : public Stmt { + friend class ASTStmtReader; + friend class ASTStmtWriter; + + /// Children of this AST node. + enum { + LOOPY_STMT, + DISTANCE_FUNC, + LOOPVAR_FUNC, + LOOPVAR_REF, + LastSubStmt = LOOPVAR_REF + }; + +private: + /// This AST node's children. + Stmt *SubStmts[LastSubStmt + 1] = {}; + + OMPCanonicalLoop() : Stmt(StmtClass::OMPCanonicalLoopClass) {} + +public: + /// Create a new OMPCanonicalLoop. 
+ static OMPCanonicalLoop *create(const ASTContext &Ctx, Stmt *LoopStmt, + CapturedStmt *DistanceFunc, + CapturedStmt *LoopVarFunc, + DeclRefExpr *LoopVarRef) { + OMPCanonicalLoop *S = new (Ctx) OMPCanonicalLoop(); + S->setLoopStmt(LoopStmt); + S->setDistanceFunc(DistanceFunc); + S->setLoopVarFunc(LoopVarFunc); + S->setLoopVarRef(LoopVarRef); + return S; + } + + /// Create an empty OMPCanonicalLoop for deserialization. + static OMPCanonicalLoop *createEmpty(const ASTContext &Ctx) { + return new (Ctx) OMPCanonicalLoop(); + } + + static bool classof(const Stmt *S) { + return S->getStmtClass() == StmtClass::OMPCanonicalLoopClass; + } + + SourceLocation getBeginLoc() const { return getLoopStmt()->getBeginLoc(); } + SourceLocation getEndLoc() const { return getLoopStmt()->getEndLoc(); } + + /// Return this AST node's children. + /// @{ + child_range children() { + return child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1); + } + const_child_range children() const { + return const_child_range(&SubStmts[0], &SubStmts[0] + LastSubStmt + 1); + } + /// @} + + /// The wrapped syntactic loop statement (ForStmt or CXXForRangeStmt). + /// @{ + Stmt *getLoopStmt() { return SubStmts[LOOPY_STMT]; } + const Stmt *getLoopStmt() const { return SubStmts[LOOPY_STMT]; } + void setLoopStmt(Stmt *S) { + assert((isa(S) || isa(S)) && + "Canonical loop must be a for loop (range-based or otherwise)"); + SubStmts[LOOPY_STMT] = S; + } + /// @} + + /// The function that computes the number of loop iterations. Can be evaluated + /// before entering the loop but after the syntactical loop's init + /// statement(s). + /// + /// Function signature: void(LogicalTy &Result) + /// Any values necessary to compute the distance are captures of the closure. 
+ /// @{ + CapturedStmt *getDistanceFunc() { + return cast(SubStmts[DISTANCE_FUNC]); + } + const CapturedStmt *getDistanceFunc() const { + return cast(SubStmts[DISTANCE_FUNC]); + } + void setDistanceFunc(CapturedStmt *S) { + assert(S && "Expected non-null captured statement"); + SubStmts[DISTANCE_FUNC] = S; + } + /// @} + + /// The function that computes the loop user variable from a logical iteration + /// counter. Can be evaluated as first statement in the loop. + /// + /// Function signature: void(LoopVarTy &Result, LogicalTy Number) + /// Any other values required to compute the loop user variable (such as start + /// value, step size) are captured by the closure. In particular, the initial + /// value of loop counter is captured by value to be unaffected by previous + /// iterations. + /// @{ + CapturedStmt *getLoopVarFunc() { + return cast(SubStmts[LOOPVAR_FUNC]); + } + const CapturedStmt *getLoopVarFunc() const { + return cast(SubStmts[LOOPVAR_FUNC]); + } + void setLoopVarFunc(CapturedStmt *S) { + assert(S && "Expected non-null captured statement"); + SubStmts[LOOPVAR_FUNC] = S; + } + /// @} + + /// Reference to the loop user variable as accessed in the loop body. + /// @{ + DeclRefExpr *getLoopVarRef() { + return cast(SubStmts[LOOPVAR_REF]); + } + const DeclRefExpr *getLoopVarRef() const { + return cast(SubStmts[LOOPVAR_REF]); + } + void setLoopVarRef(DeclRefExpr *E) { + assert(E && "Expected non-null loop variable"); + SubStmts[LOOPVAR_REF] = E; + } + /// @} +}; + /// This is a basic class for representing single OpenMP executable /// directive. /// diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -216,6 +216,7 @@ def AsTypeExpr : StmtNode; // OpenMP Directives. 
+def OMPCanonicalLoop : StmtNode; def OMPExecutableDirective : StmtNode; def OMPLoopDirective : StmtNode; def OMPParallelDirective : StmtNode; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10482,6 +10482,11 @@ /// Initialization of captured region for OpenMP region. void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope); + + /// Called for syntactical loops (ForStmt or CXXForRangeStmt) associated to + /// an OpenMP loop directive. + StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt); + /// End of OpenMP region. /// /// \param S Statement associated with the current OpenMP region. diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1830,21 +1830,21 @@ /// A CXXBoolLiteralExpr record. EXPR_CXX_BOOL_LITERAL, - EXPR_CXX_NULL_PTR_LITERAL, // CXXNullPtrLiteralExpr - EXPR_CXX_TYPEID_EXPR, // CXXTypeidExpr (of expr). - EXPR_CXX_TYPEID_TYPE, // CXXTypeidExpr (of type). - EXPR_CXX_THIS, // CXXThisExpr - EXPR_CXX_THROW, // CXXThrowExpr - EXPR_CXX_DEFAULT_ARG, // CXXDefaultArgExpr - EXPR_CXX_DEFAULT_INIT, // CXXDefaultInitExpr - EXPR_CXX_BIND_TEMPORARY, // CXXBindTemporaryExpr + EXPR_CXX_NULL_PTR_LITERAL, // CXXNullPtrLiteralExpr + EXPR_CXX_TYPEID_EXPR, // CXXTypeidExpr (of expr). + EXPR_CXX_TYPEID_TYPE, // CXXTypeidExpr (of type). 
+ EXPR_CXX_THIS, // CXXThisExpr + EXPR_CXX_THROW, // CXXThrowExpr + EXPR_CXX_DEFAULT_ARG, // CXXDefaultArgExpr + EXPR_CXX_DEFAULT_INIT, // CXXDefaultInitExpr + EXPR_CXX_BIND_TEMPORARY, // CXXBindTemporaryExpr EXPR_CXX_SCALAR_VALUE_INIT, // CXXScalarValueInitExpr EXPR_CXX_NEW, // CXXNewExpr EXPR_CXX_DELETE, // CXXDeleteExpr EXPR_CXX_PSEUDO_DESTRUCTOR, // CXXPseudoDestructorExpr - EXPR_EXPR_WITH_CLEANUPS, // ExprWithCleanups + EXPR_EXPR_WITH_CLEANUPS, // ExprWithCleanups EXPR_CXX_DEPENDENT_SCOPE_MEMBER, // CXXDependentScopeMemberExpr EXPR_CXX_DEPENDENT_SCOPE_DECL_REF, // DependentScopeDeclRefExpr @@ -1852,41 +1852,42 @@ EXPR_CXX_UNRESOLVED_MEMBER, // UnresolvedMemberExpr EXPR_CXX_UNRESOLVED_LOOKUP, // UnresolvedLookupExpr - EXPR_CXX_EXPRESSION_TRAIT, // ExpressionTraitExpr - EXPR_CXX_NOEXCEPT, // CXXNoexceptExpr + EXPR_CXX_EXPRESSION_TRAIT, // ExpressionTraitExpr + EXPR_CXX_NOEXCEPT, // CXXNoexceptExpr - EXPR_OPAQUE_VALUE, // OpaqueValueExpr - EXPR_BINARY_CONDITIONAL_OPERATOR, // BinaryConditionalOperator - EXPR_TYPE_TRAIT, // TypeTraitExpr - EXPR_ARRAY_TYPE_TRAIT, // ArrayTypeTraitIntExpr + EXPR_OPAQUE_VALUE, // OpaqueValueExpr + EXPR_BINARY_CONDITIONAL_OPERATOR, // BinaryConditionalOperator + EXPR_TYPE_TRAIT, // TypeTraitExpr + EXPR_ARRAY_TYPE_TRAIT, // ArrayTypeTraitIntExpr - EXPR_PACK_EXPANSION, // PackExpansionExpr - EXPR_SIZEOF_PACK, // SizeOfPackExpr - EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr - EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK,// SubstNonTypeTemplateParmPackExpr - EXPR_FUNCTION_PARM_PACK, // FunctionParmPackExpr - EXPR_MATERIALIZE_TEMPORARY, // MaterializeTemporaryExpr - EXPR_CXX_FOLD, // CXXFoldExpr - EXPR_CONCEPT_SPECIALIZATION,// ConceptSpecializationExpr - EXPR_REQUIRES, // RequiresExpr + EXPR_PACK_EXPANSION, // PackExpansionExpr + EXPR_SIZEOF_PACK, // SizeOfPackExpr + EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr + EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK, // SubstNonTypeTemplateParmPackExpr + 
EXPR_FUNCTION_PARM_PACK, // FunctionParmPackExpr + EXPR_MATERIALIZE_TEMPORARY, // MaterializeTemporaryExpr + EXPR_CXX_FOLD, // CXXFoldExpr + EXPR_CONCEPT_SPECIALIZATION, // ConceptSpecializationExpr + EXPR_REQUIRES, // RequiresExpr // CUDA - EXPR_CUDA_KERNEL_CALL, // CUDAKernelCallExpr + EXPR_CUDA_KERNEL_CALL, // CUDAKernelCallExpr // OpenCL - EXPR_ASTYPE, // AsTypeExpr + EXPR_ASTYPE, // AsTypeExpr // Microsoft - EXPR_CXX_PROPERTY_REF_EXPR, // MSPropertyRefExpr + EXPR_CXX_PROPERTY_REF_EXPR, // MSPropertyRefExpr EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR, // MSPropertySubscriptExpr - EXPR_CXX_UUIDOF_EXPR, // CXXUuidofExpr (of expr). - EXPR_CXX_UUIDOF_TYPE, // CXXUuidofExpr (of type). - STMT_SEH_LEAVE, // SEHLeaveStmt - STMT_SEH_EXCEPT, // SEHExceptStmt - STMT_SEH_FINALLY, // SEHFinallyStmt - STMT_SEH_TRY, // SEHTryStmt + EXPR_CXX_UUIDOF_EXPR, // CXXUuidofExpr (of expr). + EXPR_CXX_UUIDOF_TYPE, // CXXUuidofExpr (of type). + STMT_SEH_LEAVE, // SEHLeaveStmt + STMT_SEH_EXCEPT, // SEHExceptStmt + STMT_SEH_FINALLY, // SEHFinallyStmt + STMT_SEH_TRY, // SEHTryStmt // OpenMP directives + STMT_OMP_CANONICAL_LOOP, STMT_OMP_PARALLEL_DIRECTIVE, STMT_OMP_SIMD_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, @@ -1946,10 +1947,10 @@ EXPR_OMP_ITERATOR, // ARC - EXPR_OBJC_BRIDGED_CAST, // ObjCBridgedCastExpr + EXPR_OBJC_BRIDGED_CAST, // ObjCBridgedCastExpr - STMT_MS_DEPENDENT_EXISTS, // MSDependentExistsStmt - EXPR_LAMBDA, // LambdaExpr + STMT_MS_DEPENDENT_EXISTS, // MSDependentExistsStmt + EXPR_LAMBDA, // LambdaExpr STMT_COROUTINE_BODY, STMT_CORETURN, EXPR_COAWAIT, diff --git a/clang/lib/AST/Stmt.cpp b/clang/lib/AST/Stmt.cpp --- a/clang/lib/AST/Stmt.cpp +++ b/clang/lib/AST/Stmt.cpp @@ -1266,13 +1266,6 @@ break; case VCK_ByCopy: assert(Var && "capturing by copy must have a variable!"); - assert( - (Var->getType()->isScalarType() || (Var->getType()->isReferenceType() && - Var->getType() - ->castAs() - ->getPointeeType() - ->isScalarType())) && - "captures by copy are expected to have a scalar type!"); break; 
case VCK_VLAType: assert(!Var && diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -91,6 +91,8 @@ for (Stmt *S : CS->body()) { if (!S) continue; + if (auto *CanonLoop = dyn_cast(S)) + S = CanonLoop->getLoopStmt(); if (isa(S) || isa(S)) { // Only single loop construct is allowed. if (CurStmt) { @@ -121,6 +123,8 @@ Stmt *OMPLoopDirective::getBody() { // This relies on the loop form is already checked by Sema. Stmt *Body = Data->getRawStmt()->IgnoreContainers(); + if (auto *CanonLoop = dyn_cast(Body)) + Body = CanonLoop->getLoopStmt(); if (auto *For = dyn_cast(Body)) { Body = For->getBody(); } else { @@ -130,6 +134,8 @@ } for (unsigned Cnt = 1; Cnt < CollapsedNum; ++Cnt) { Body = tryToFindNextInnerLoop(Body, /*TryImperfectlyNestedLoops=*/true); + if (auto *CanonLoop = dyn_cast(Body)) + Body = CanonLoop->getLoopStmt(); if (auto *For = dyn_cast(Body)) { Body = For->getBody(); } else { diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -636,6 +636,10 @@ // OpenMP directives printing methods //===----------------------------------------------------------------------===// +void StmtPrinter::VisitOMPCanonicalLoop(OMPCanonicalLoop *Node) { + PrintStmt(Node->getLoopStmt()); +} + void StmtPrinter::PrintOMPExecutableDirective(OMPExecutableDirective *S, bool ForceNoStmt) { OMPClausePrinter Printer(OS, Policy); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -848,6 +848,10 @@ P.Visit(*I); } +void StmtProfiler::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) { + VisitStmt(L); +} + void StmtProfiler::VisitOMPLoopDirective(const OMPLoopDirective *S) { VisitOMPExecutableDirective(S); } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp --- a/clang/lib/CodeGen/CGStmt.cpp 
+++ b/clang/lib/CodeGen/CGStmt.cpp @@ -194,6 +194,9 @@ case Stmt::SEHTryStmtClass: EmitSEHTryStmt(cast(*S)); break; + case Stmt::OMPCanonicalLoopClass: + EmitOMPCanonicalLoop(cast(S)); + break; case Stmt::OMPParallelDirectiveClass: EmitOMPParallelDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -163,7 +163,9 @@ for (unsigned Cnt = 0; Cnt < S.getCollapsedNumber(); ++Cnt) { Body = OMPLoopDirective::tryToFindNextInnerLoop( Body, /*TryImperfectlyNestedLoops=*/true); - if (auto *For = dyn_cast(Body)) { + if (const auto *CanonLoop = dyn_cast(Body)) + Body = CanonLoop->getLoopStmt(); + if (const auto *For = dyn_cast(Body)) { Body = For->getBody(); } else { assert(isa(Body) && @@ -1771,6 +1773,8 @@ return; } if (SimplifiedS == NextLoop) { + if (const auto *CanonLoop = dyn_cast(SimplifiedS)) + SimplifiedS = CanonLoop->getLoopStmt(); if (const auto *For = dyn_cast(SimplifiedS)) { S = For->getBody(); } else { @@ -1858,6 +1862,118 @@ BreakContinueStack.pop_back(); } +using EmittedClosureTy = std::pair; + +/// Emit a captured statement and return the function as well as its captured +/// closure context. +static EmittedClosureTy emitCapturedStmtFunc(CodeGenFunction &ParentCGF, + const CapturedStmt *S) { + LValue CapStruct = ParentCGF.InitCapturedStruct(*S); + CodeGenFunction CGF(ParentCGF.CGM, true); + std::unique_ptr CSI = + std::make_unique(*S); + CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, CSI.get()); + llvm::Function *F = CGF.GenerateCapturedStmtFunction(*S); + + return {F, CapStruct.getPointer(ParentCGF)}; +} + +/// Emit a call to a previously captured closure. +static llvm::CallInst * +emitCapturedStmtCall(CodeGenFunction &ParentCGF, EmittedClosureTy Cap, + llvm::ArrayRef Args) { + // Append the closure context to the argument. 
+ SmallVector EffectiveArgs; + EffectiveArgs.reserve(Args.size() + 1); + llvm::append_range(EffectiveArgs, Args); + EffectiveArgs.push_back(Cap.second); + + return ParentCGF.Builder.CreateCall(Cap.first, EffectiveArgs); +} + +llvm::CanonicalLoopInfo * +CodeGenFunction::EmitOMPCollapsedCanonicalLoopNest(const Stmt *S, int Depth) { + assert(Depth == 1 && "Nested loops with OpenMPIRBuilder not yet implemented"); + + EmitStmt(S); + assert(OMPLoopNestStack.size() >= (size_t)Depth && "Found too few loops"); + + // The last added loop is the outermost one. + return OMPLoopNestStack.back(); +} + +void CodeGenFunction::EmitOMPCanonicalLoop(const OMPCanonicalLoop *S) { + const Stmt *SyntacticalLoop = S->getLoopStmt(); + if (!getLangOpts().OpenMPIRBuilder) { + // Ignore if OpenMPIRBuilder is not enabled. + EmitStmt(SyntacticalLoop); + return; + } + + LexicalScope ForScope(*this, S->getSourceRange()); + + // Emit init statements. The Distance/LoopVar funcs may reference variable + // declarations they contain. + const Stmt *BodyStmt; + if (const auto *For = dyn_cast(SyntacticalLoop)) { + if (const Stmt *InitStmt = For->getInit()) + EmitStmt(InitStmt); + BodyStmt = For->getBody(); + } else if (auto *RangeFor = dyn_cast(SyntacticalLoop)) { + if (const DeclStmt *RangeStmt = RangeFor->getRangeStmt()) + EmitStmt(RangeStmt); + if (const DeclStmt *BeginStmt = RangeFor->getBeginStmt()) + EmitStmt(BeginStmt); + if (const DeclStmt *EndStmt = RangeFor->getEndStmt()) + EmitStmt(EndStmt); + if (const DeclStmt *LoopVarStmt = RangeFor->getLoopVarStmt()) + EmitStmt(LoopVarStmt); + BodyStmt = RangeFor->getBody(); + } else + llvm_unreachable("Expected for-stmt or range-based for-stmt"); + + // Emit closure for later use. By-value captures will be captured here. 
+ const CapturedStmt *DistanceFunc = S->getDistanceFunc(); + EmittedClosureTy DistanceClosure = emitCapturedStmtFunc(*this, DistanceFunc); + const CapturedStmt *LoopVarFunc = S->getLoopVarFunc(); + EmittedClosureTy LoopVarClosure = emitCapturedStmtFunc(*this, LoopVarFunc); + + // Call the distance function to get the number of iterations of the loop to + // come. + QualType LogicalTy = DistanceFunc->getCapturedDecl() + ->getParam(0) + ->getType() + .getNonReferenceType(); + Address CountAddr = CreateMemTemp(LogicalTy, ".count.addr"); + emitCapturedStmtCall(*this, DistanceClosure, {CountAddr.getPointer()}); + llvm::Value *DistVal = Builder.CreateLoad(CountAddr, ".count"); + + // Emit the loop structure. + llvm::OpenMPIRBuilder &OMPBuilder = CGM.getOpenMPRuntime().getOMPBuilder(); + llvm::CanonicalLoopInfo *CL = + OMPBuilder.createCanonicalLoop(Builder, {}, DistVal); + + // Emit the loop body: Convert the logical iteration number to the loop + // variable and emit the body. + Builder.restoreIP(CL->getBodyIP()); + const DeclRefExpr *LoopVarRef = S->getLoopVarRef(); + LValue LCVal = EmitLValue(LoopVarRef); + Address LoopVarAddress = LCVal.getAddress(*this); + emitCapturedStmtCall(*this, LoopVarClosure, + {LoopVarAddress.getPointer(), CL->getIndVar()}); + { + RunCleanupsScope BodyScope(*this); + EmitStmt(BodyStmt); + } + + // Finish up the loop. + Builder.restoreIP(CL->getAfterIP()); + ForScope.ForceCleanup(); + + // Remember the CanonicalLoopInfo for parent AST nodes consuming it. 
+ OMPLoopNestStack.push_back(CL); +} + void CodeGenFunction::EmitOMPInnerLoop( const OMPExecutableDirective &S, bool RequiresCleanup, const Expr *LoopCond, const Expr *IncExpr, @@ -1875,6 +1991,7 @@ const CapturedStmt *ICS = OMPED.getInnermostCapturedStmt(); const Stmt *SS = ICS->getCapturedStmt(); const AttributedStmt *AS = dyn_cast_or_null(SS); + OMPLoopNestStack.clear(); if (AS) LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(), AS->getAttrs(), SourceLocToDebugLoc(R.getBegin()), @@ -2424,6 +2541,7 @@ llvm::BasicBlock *CondBlock = createBasicBlock("omp.dispatch.cond"); EmitBlock(CondBlock); const SourceRange R = S.getSourceRange(); + OMPLoopNestStack.clear(); LoopStack.push(CondBlock, SourceLocToDebugLoc(R.getBegin()), SourceLocToDebugLoc(R.getEnd())); @@ -2507,6 +2625,7 @@ } EmitBranch(CondBlock); + OMPLoopNestStack.clear(); LoopStack.pop(); // Emit the fall-through block. EmitBlock(LoopExit.getBlock()); @@ -3349,10 +3468,38 @@ return HasLastprivates; } +static bool isSupportedByOpenMPIRBuilder(const OMPForDirective &S) { + if (S.hasCancel()) + return false; + for (OMPClause *C : S.clauses()) + if (!isa(C)) + return false; + + return true; +} + void CodeGenFunction::EmitOMPForDirective(const OMPForDirective &S) { bool HasLastprivates = false; - auto &&CodeGen = [&S, &HasLastprivates](CodeGenFunction &CGF, - PrePostActionTy &) { + bool UseOMPIRBuilder = + CGM.getLangOpts().OpenMPIRBuilder && isSupportedByOpenMPIRBuilder(S); + auto &&CodeGen = [this, &S, &HasLastprivates, + UseOMPIRBuilder](CodeGenFunction &CGF, PrePostActionTy &) { + // Use the OpenMPIRBuilder if enabled. + if (UseOMPIRBuilder) { + // Emit the associated statement and get its loop representation. 
+ const Stmt *Inner = S.getRawStmt(); + llvm::CanonicalLoopInfo *CLI = + EmitOMPCollapsedCanonicalLoopNest(Inner, 1); + + bool NeedsBarrier = !S.getSingleClause(); + llvm::OpenMPIRBuilder &OMPBuilder = + CGM.getOpenMPRuntime().getOMPBuilder(); + llvm::OpenMPIRBuilder::InsertPointTy AllocaIP( + AllocaInsertPt->getParent(), AllocaInsertPt->getIterator()); + OMPBuilder.createWorkshareLoop(Builder, CLI, AllocaIP, NeedsBarrier); + return; + } + HasLastprivates = emitWorksharingDirective(CGF, S, S.hasCancel()); }; { @@ -3363,9 +3510,11 @@ S.hasCancel()); } - // Emit an implicit barrier at the end. - if (!S.getSingleClause() || HasLastprivates) - CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for); + if (!UseOMPIRBuilder) { + // Emit an implicit barrier at the end. + if (!S.getSingleClause() || HasLastprivates) + CGM.getOpenMPRuntime().emitBarrierCall(*this, S.getBeginLoc(), OMPD_for); + } // Check for outer lastprivate conditional update. checkForLastprivateConditionalUpdate(*this, S); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -50,6 +50,7 @@ class SwitchInst; class Twine; class Value; +class CanonicalLoopInfo; } namespace clang { @@ -276,6 +277,20 @@ // because of jumps. VarBypassDetector Bypasses; + /// List of recently emitted OMPCanonicalLoops. + /// + /// Since OMPCanonicalLoops are nested inside other statements (in particular + /// CapturedStmt generated by OMPExecutableDirective and non-perfectly nested + /// loops), we cannot directly call OMPEmitOMPCanonicalLoop and receive its + /// llvm::CanonicalLoopInfo. Instead, we call EmitStmt and any + /// OMPEmitOMPCanonicalLoop called by it will add its CanonicalLoopInfo to + /// this stack when done. 
Entering a new loop requires clearing this list; it + /// either means we start parsing a new loop nest (in which case the previous + /// loop nest goes out of scope) or a second loop in the same level in which + /// case it would be ambiguous into which of the two (or more) loops the loop + /// nest would extend. + SmallVector OMPLoopNestStack; + // CodeGen lambda for loops and support for ordered clause typedef llvm::function_ref @@ -3499,6 +3514,18 @@ static void EmitOMPTargetTeamsDistributeParallelForDeviceFunction( CodeGenModule &CGM, StringRef ParentName, const OMPTargetTeamsDistributeParallelForDirective &S); + + /// Emit the Stmt \p S and return its topmost canonical loop, if any. + /// TODO: The \p Depth parameter is not yet implemented and must be 1. In the + /// future it is meant to be the number of loops expected in the loop nests + /// (usually specified by the "collapse" clause) that are collapsed to a + /// single loop by this function. + llvm::CanonicalLoopInfo *EmitOMPCollapsedCanonicalLoopNest(const Stmt *S, + int Depth); + + /// Emit an OMPCanonicalLoop using the OpenMPIRBuilder. + void EmitOMPCanonicalLoop(const OMPCanonicalLoop *S); + /// Emit inner loop of the worksharing/simd construct. /// /// \param S Directive, for which the inner loop must be emitted. diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -91,8 +91,8 @@ // seems to be a reasonable spot. We do it here, as opposed to the deletion // time of the CodeGenModule, because we have to ensure the IR has not yet // been "emitted" to the outside, thus, modifications are still sensible. 
- if (CGM.getLangOpts().OpenMPIRBuilder) - CGM.getOpenMPRuntime().getOMPBuilder().finalize(); + if (CGM.getLangOpts().OpenMPIRBuilder && CurFn) + CGM.getOpenMPRuntime().getOMPBuilder().finalize(CurFn); } // Map the LangOption for exception behavior into diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2529,7 +2529,15 @@ // the captured region. Code elsewhere assumes that any FunctionScopeInfo // should have at least one compound statement scope within it. ParsingOpenMPDirectiveRAII NormalScope(*this, /*Value=*/false); - AssociatedStmt = (Sema::CompoundScopeRAII(Actions), ParseStatement()); + { + Sema::CompoundScopeRAII Scope(Actions); + AssociatedStmt = ParseStatement(); + + if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && + getLangOpts().OpenMPIRBuilder) + AssociatedStmt = + Actions.ActOnOpenMPCanonicalLoop(AssociatedStmt.get()); + } AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data || DKind == OMPD_target_exit_data) { diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1448,6 +1448,7 @@ case Stmt::OMPMasterTaskLoopDirectiveClass: case Stmt::OMPMasterTaskLoopSimdDirectiveClass: case Stmt::OMPOrderedDirectiveClass: + case Stmt::OMPCanonicalLoopClass: case Stmt::OMPParallelDirectiveClass: case Stmt::OMPParallelForDirectiveClass: case Stmt::OMPParallelForSimdDirectiveClass: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -17263,18 +17263,17 @@ /// Capture the given variable in the captured region. 
-static bool captureInCapturedRegion(CapturedRegionScopeInfo *RSI, - VarDecl *Var, - SourceLocation Loc, - const bool BuildAndDiagnose, - QualType &CaptureType, - QualType &DeclRefType, - const bool RefersToCapturedVariable, - Sema &S, bool Invalid) { +static bool captureInCapturedRegion( + CapturedRegionScopeInfo *RSI, VarDecl *Var, SourceLocation Loc, + const bool BuildAndDiagnose, QualType &CaptureType, QualType &DeclRefType, + const bool RefersToCapturedVariable, Sema::TryCaptureKind Kind, + bool IsTopScope, Sema &S, bool Invalid) { // By default, capture variables by reference. bool ByRef = true; - // Using an LValue reference type is consistent with Lambdas (see below). - if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) { + if (IsTopScope && Kind != Sema::TryCapture_Implicit) { + ByRef = (Kind == Sema::TryCapture_ExplicitByRef); + } else if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) { + // Using an LValue reference type is consistent with Lambdas (see below). 
if (S.isOpenMPCapturedDecl(Var)) { bool HasConst = DeclRefType.isConstQualified(); DeclRefType = DeclRefType.getUnqualifiedType(); @@ -17620,9 +17619,9 @@ DeclRefType, Nested, *this, Invalid); Nested = true; } else if (CapturedRegionScopeInfo *RSI = dyn_cast(CSI)) { - Invalid = !captureInCapturedRegion(RSI, Var, ExprLoc, BuildAndDiagnose, - CaptureType, DeclRefType, Nested, - *this, Invalid); + Invalid = !captureInCapturedRegion( + RSI, Var, ExprLoc, BuildAndDiagnose, CaptureType, DeclRefType, Nested, + Kind, /*IsTopScope*/ I == N - 1, *this, Invalid); Nested = true; } else { LambdaScopeInfo *LSI = cast(CSI); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -3767,6 +3767,9 @@ Visit(E->getBase()); } } + + void VisitOMPCanonicalLoop(OMPCanonicalLoop *S) { VisitStmt(S); } + void VisitOMPExecutableDirective(OMPExecutableDirective *S) { for (OMPClause *C : S->clauses()) { // Skip analysis of arguments of implicitly defined firstprivate clause @@ -5130,6 +5133,350 @@ } } +namespace { +/// Rewrite statements and expressions for Sema \p Actions CurContext. +/// Used to capture variable references if already parsed statements/expressions +/// into a CapturedStatement. +class CaptureVars : public TreeTransform { + using BaseTransform = TreeTransform; + +public: + CaptureVars(Sema &Actions) : BaseTransform(Actions) {} + + bool AlwaysRebuild() { return true; } +}; +} // namespace + +/// Create a closure that computes the number of iterations of a loop. +/// +/// \param Actions The Sema object. +/// \param LogicalTy Type for the logical iteration number. +/// \param Rel Comparison operator of the loop condition. +/// \param StartExpr Value of the loop counter at the first iteration. +/// \param StopExpr Expression the loop counter is compared against in the loop +/// condition. \param Step Amount of increment after each iteration. 
+/// +/// \return Closure (CapturedStmt) of the distance calculation. +static CapturedStmt *buildDistanceFunc(Sema &Actions, QualType LogicalTy, + BinaryOperator::Opcode Rel, + Expr *StartExpr, Expr *StopExpr, + Expr *Step) { + ASTContext &Ctx = Actions.getASTContext(); + TypeSourceInfo *LogicalTSI = Ctx.getTrivialTypeSourceInfo(LogicalTy); + + // Captured regions currently don't support return values, we use an + // out-parameter instead. All inputs are implicit captures. + // TODO: Instead of capturing each DeclRefExpr occurring in + // StartExpr/StopExpr/Step, these could also be passed as a value capture. + QualType ResultTy = Ctx.getLValueReferenceType(LogicalTy); + Sema::CapturedParamNameType Params[] = {{"Distance", ResultTy}, + {StringRef(), QualType()}}; + Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params); + + Expr *Body; + { + Sema::CompoundScopeRAII CompoundScope(Actions); + CapturedDecl *CS = cast(Actions.CurContext); + + // Get the LValue expression for the result. + ImplicitParamDecl *DistParam = CS->getParam(0); + DeclRefExpr *DistRef = Actions.BuildDeclRefExpr( + DistParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr); + + // Capture all referenced variable references. + CaptureVars Recap(Actions); + Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr)); + Expr *NewStop = AssertSuccess(Recap.TransformExpr(StopExpr)); + Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step)); + + IntegerLiteral *Zero = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 0), LogicalTy, {}); + Expr *Dist; + if (Rel == BO_NE) { + // When using a != comparison, the increment can be +1 or -1. This can be + // dynamic at runtime, so we need to check for the direction. + Expr *IsNegStep = + AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_LT, NewStep, Zero)); + + // Positive increment. 
+ Expr *ForwardRange = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart)); + ForwardRange = AssertSuccess( + Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, ForwardRange)); + Expr *ForwardDist = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Div, ForwardRange, NewStep)); + + // Negative increment. + Expr *BackwardRange = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop)); + BackwardRange = AssertSuccess( + Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, BackwardRange)); + Expr *NegIncAmount = + AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep)); + Expr *BackwardDist = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Div, BackwardRange, NegIncAmount)); + + // Use the appropriate case. + Dist = AssertSuccess(Actions.ActOnConditionalOp( + {}, {}, IsNegStep, BackwardDist, ForwardDist)); + } else { + assert((Rel == BO_LT || Rel == BO_LE || Rel == BO_GE || Rel == BO_GT) && + "Expected one of these relational operators"); + + // We can derive the direction from any other comparison operator. It is + // non well-formed OpenMP if Step increments/decrements in the other + // directions. Whether at least the first iteration passes the loop + // condition. + Expr *HasAnyIteration = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, Rel, NewStart, NewStop)); + + // Compute the range between first and last counter value. + Expr *Range; + if (Rel == BO_GE || Rel == BO_GT) + Range = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStart, NewStop)); + else + Range = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Sub, NewStop, NewStart)); + + // Ensure unsigned range space. + Range = + AssertSuccess(Actions.BuildCStyleCastExpr({}, LogicalTSI, {}, Range)); + + if (Rel == BO_LE || Rel == BO_GE) { + // Add one to the range if the relational operator is inclusive. + Range = + AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_PreInc, Range)); + } + + // Divide by the absolute step amount. 
+ if (Rel == BO_GE || Rel == BO_GT) + NewStep = + AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Minus, NewStep)); + Dist = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Div, Range, NewStep)); + + // If there is not at least one iteration, the range contains garbage. Fix + // to zero in this case. + Dist = AssertSuccess( + Actions.ActOnConditionalOp({}, {}, HasAnyIteration, Dist, Zero)); + } + + // Assign the result to the out-parameter. + Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {}, + BO_Assign, DistRef, Dist)); + } + return cast( + AssertSuccess(Actions.ActOnCapturedRegionEnd(Body))); +} + +/// Create a closure that computes the loop variable from the logical iteration +/// number. +/// +/// \param Actions The Sema object. +/// \param LoopVarTy Type for the loop variable used for result value. +/// \param LogicalTy Type for the logical iteration number. +/// \param StartExpr Value of the loop counter at the first iteration. +/// \param Step Amount of increment after each iteration. +/// \param Deref Whether the loop variable is a dereference of the loop +/// counter variable. +/// +/// \return Closure (CapturedStmt) of the loop value calculation. +static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy, + QualType LogicalTy, + DeclRefExpr *StartExpr, Expr *Step, + bool Deref) { + ASTContext &Ctx = Actions.getASTContext(); + + // Pass the result as an out-parameter. Passing as return value would require + // the OpenMPIRBuilder to know additional C/C++ semantics, such as how to + // invoke a copy constructor. + QualType TargetParamTy = Ctx.getLValueReferenceType(LoopVarTy); + Sema::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy}, + {"Logical", LogicalTy}, + {StringRef(), QualType()}}; + Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params); + + // Capture the initial iterator which represents the LoopVar value at the + // zero's logical iteration. 
Since the original ForStmt/CXXForRangeStmt update + // it in every iteration, capture it by value before it is modified. + VarDecl *StartVar = cast(StartExpr->getDecl()); + bool Invalid = Actions.tryCaptureVariable(StartVar, {}, + Sema::TryCapture_ExplicitByVal, {}); + (void)Invalid; + assert(!Invalid && "Expecting capture-by-value to work."); + + Expr *Body; + { + Sema::CompoundScopeRAII CompoundScope(Actions); + auto *CS = cast(Actions.CurContext); + + ImplicitParamDecl *TargetParam = CS->getParam(0); + DeclRefExpr *TargetRef = Actions.BuildDeclRefExpr( + TargetParam, LoopVarTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr); + ImplicitParamDecl *IndvarParam = CS->getParam(1); + DeclRefExpr *LogicalRef = Actions.BuildDeclRefExpr( + IndvarParam, LogicalTy, VK_LValue, {}, nullptr, nullptr, {}, nullptr); + + // Capture the Start expression. + CaptureVars Recap(Actions); + Expr *NewStart = AssertSuccess(Recap.TransformExpr(StartExpr)); + Expr *NewStep = AssertSuccess(Recap.TransformExpr(Step)); + + Expr *Skip = AssertSuccess( + Actions.BuildBinOp(nullptr, {}, BO_Mul, NewStep, LogicalRef)); + // TODO: Explicitly cast to the iterator's difference_type instead of + // relying on implicit conversion. + Expr *Advanced = + AssertSuccess(Actions.BuildBinOp(nullptr, {}, BO_Add, NewStart, Skip)); + + if (Deref) { + // For range-based for-loops convert the loop counter value to a concrete + // loop variable value by dereferencing the iterator. + Advanced = + AssertSuccess(Actions.BuildUnaryOp(nullptr, {}, UO_Deref, Advanced)); + } + + // Assign the result to the output parameter. 
+ Body = AssertSuccess(Actions.BuildBinOp(Actions.getCurScope(), {}, + BO_Assign, TargetRef, Advanced)); + } + return cast( + AssertSuccess(Actions.ActOnCapturedRegionEnd(Body))); +} + +StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { + ASTContext &Ctx = getASTContext(); + + // Extract the common elements of ForStmt and CXXForRangeStmt: + // Loop variable, repeat condition, increment + Expr *Cond, *Inc; + VarDecl *CounterDecl, *LVDecl; + if (auto *For = dyn_cast(AStmt)) { + Stmt *Init = For->getInit(); + if (auto *LCVarDeclStmt = dyn_cast(Init)) { + // For statement declares loop variable. + CounterDecl = cast(LCVarDeclStmt->getSingleDecl()); + } else if (auto *LCAssign = dyn_cast(Init)) { + // For statement reuses variable. + assert(LCAssign->getOpcode() == BO_Assign && + "init part must be a loop variable assignment"); + auto *CounterRef = cast(LCAssign->getLHS()); + CounterDecl = cast(CounterRef->getDecl()); + } else + llvm_unreachable("Cannot determine loop variable"); + LVDecl = CounterDecl; + + Cond = For->getCond(); + Inc = For->getInc(); + } else if (auto *RangeFor = dyn_cast(AStmt)) { + DeclStmt *BeginStmt = RangeFor->getBeginStmt(); + CounterDecl = cast(BeginStmt->getSingleDecl()); + LVDecl = RangeFor->getLoopVariable(); + + Cond = RangeFor->getCond(); + Inc = RangeFor->getInc(); + } else + llvm_unreachable("unhandled kind of loop"); + + QualType CounterTy = CounterDecl->getType(); + QualType LVTy = LVDecl->getType(); + + // Analyze the loop condition. 
+ Expr *LHS, *RHS; + BinaryOperator::Opcode CondRel; + Cond = Cond->IgnoreImplicit(); + if (auto *CondBinExpr = dyn_cast(Cond)) { + LHS = CondBinExpr->getLHS(); + RHS = CondBinExpr->getRHS(); + CondRel = CondBinExpr->getOpcode(); + } else if (auto *CondCXXOp = dyn_cast(Cond)) { + assert(CondCXXOp->getOperator() == OO_ExclaimEqual && + "Expected != loop condition for iterator-based loops"); + assert(CondCXXOp->getNumArgs() == 2 && "Comparison should have 2 operands"); + LHS = CondCXXOp->getArg(0); + RHS = CondCXXOp->getArg(1); + CondRel = BO_NE; + } else + llvm_unreachable("unexpected loop condition"); + + // Normalize such that the loop counter is on the LHS. + if (!isa(LHS->IgnoreImplicit()) || + cast(LHS->IgnoreImplicit())->getDecl() != CounterDecl) { + std::swap(LHS, RHS); + CondRel = BinaryOperator::reverseComparisonOp(CondRel); + } + auto *CounterRef = cast(LHS->IgnoreImplicit()); + + // Decide the bit width for the logical iteration counter. By default use the + // unsigned ptrdiff_t integer size (for iterators and pointers). + // TODO: For iterators, use iterator::difference_type, + // std::iterator_traits<>::difference_type or decltype(it - end). + QualType LogicalTy = Ctx.getUnsignedPointerDiffType(); + if (CounterTy->isIntegerType()) { + unsigned BitWidth = Ctx.getIntWidth(CounterTy); + LogicalTy = Ctx.getIntTypeForBitwidth(BitWidth, false); + } + + // Analyze the loop increment. 
+ Expr *Step; + if (auto *IncUn = dyn_cast(Inc)) { + int Direction; + switch (IncUn->getOpcode()) { + case UO_PreInc: + case UO_PostInc: + Direction = 1; + break; + case UO_PreDec: + case UO_PostDec: + Direction = -1; + break; + default: + llvm_unreachable("unhandled unary increment operator"); + } + Step = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction), LogicalTy, {}); + } else if (auto *IncBin = dyn_cast(Inc)) { + if (IncBin->getOpcode() == BO_AddAssign) { + Step = IncBin->getRHS(); + } else if (IncBin->getOpcode() == BO_SubAssign) { + Step = + AssertSuccess(BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS())); + } else + llvm_unreachable("unhandled binary increment operator"); + } else if (auto *CondCXXOp = dyn_cast(Inc)) { + switch (CondCXXOp->getOperator()) { + case OO_PlusPlus: + Step = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), 1), LogicalTy, {}); + break; + case OO_MinusMinus: + Step = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), -1), LogicalTy, {}); + break; + case OO_PlusEqual: + Step = CondCXXOp->getArg(1); + break; + case OO_MinusEqual: + Step = AssertSuccess( + BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1))); + break; + default: + llvm_unreachable("unhandled overloaded increment operator"); + } + } else + llvm_unreachable("unknown increment expression"); + + CapturedStmt *DistanceFunc = + buildDistanceFunc(*this, LogicalTy, CondRel, LHS, RHS, Step); + CapturedStmt *LoopVarFunc = buildLoopVarFunc( + *this, LVTy, LogicalTy, CounterRef, Step, isa(AStmt)); + DeclRefExpr *LVRef = BuildDeclRefExpr(LVDecl, LVDecl->getType(), VK_LValue, + {}, nullptr, nullptr, {}, nullptr); + return OMPCanonicalLoop::create(getASTContext(), AStmt, DistanceFunc, + LoopVarFunc, LVRef); +} + StmtResult Sema::ActOnOpenMPExecutableDirective( OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, @@ -7947,6 +8294,8 @@ // 
OpenMP [2.9.1, Canonical Loop Form] // for (init-expr; test-expr; incr-expr) structured-block // for (range-decl: range-expr) structured-block + if (auto *CanonLoop = dyn_cast_or_null(S)) + S = CanonLoop->getLoopStmt(); auto *For = dyn_cast_or_null(S); auto *CXXFor = dyn_cast_or_null(S); // Ranged for is supported only in OpenMP 5.0. @@ -8280,6 +8629,17 @@ return PostUpdate; } +static Stmt *findLoopBody(Stmt *L) { + if (auto *CanonLoop = dyn_cast_or_null(L)) + L = CanonLoop->getLoopStmt(); + if (auto *For = dyn_cast(L)) + return For->getBody(); + + assert(isa(L) && + "Expected canonical for or range-based for loops."); + return cast(L)->getBody(); +} + /// Called on a for stmt to check itself and nested loops (if any). /// \return Returns 0 if one of the collapsed stmts is not canonical for loop, /// number of collapsed loops otherwise. @@ -8340,13 +8700,7 @@ // All loops associated with the construct must be perfectly nested; that // is, there must be no intervening code nor any OpenMP directive between // any two loops. - if (auto *For = dyn_cast(CurStmt)) { - CurStmt = For->getBody(); - } else { - assert(isa(CurStmt) && - "Expected canonical for or range-based for loops."); - CurStmt = cast(CurStmt)->getBody(); - } + CurStmt = findLoopBody(CurStmt); CurStmt = OMPLoopDirective::tryToFindNextInnerLoop( CurStmt, SemaRef.LangOpts.OpenMP >= 50); } @@ -8368,13 +8722,7 @@ // All loops associated with the construct must be perfectly nested; that // is, there must be no intervening code nor any OpenMP directive between // any two loops. 
- if (auto *For = dyn_cast(CurStmt)) { - CurStmt = For->getBody(); - } else { - assert(isa(CurStmt) && - "Expected canonical for or range-based for loops."); - CurStmt = cast(CurStmt)->getBody(); - } + CurStmt = findLoopBody(CurStmt); CurStmt = OMPLoopDirective::tryToFindNextInnerLoop( CurStmt, SemaRef.LangOpts.OpenMP >= 50); } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -8314,6 +8314,15 @@ //===----------------------------------------------------------------------===// // OpenMP directive transformation //===----------------------------------------------------------------------===// + +template +StmtResult +TreeTransform::TransformOMPCanonicalLoop(OMPCanonicalLoop *L) { + // The OMPCanonicalLoop will be recreated when transforming the loop-associated + // directive. + return getDerived().TransformStmt(L->getLoopStmt()); +} + template StmtResult TreeTransform::TransformOMPExecutableDirective( OMPExecutableDirective *D) { @@ -8350,6 +8359,9 @@ else CS = D->getInnermostCapturedStmt()->getCapturedStmt(); Body = getDerived().TransformStmt(CS); + if (Body.isUsable() && isOpenMPLoopDirective(D->getDirectiveKind()) && + getSema().getLangOpts().OpenMPIRBuilder) + Body = getSema().ActOnOpenMPCanonicalLoop(Body.get()); } AssociatedStmt = getDerived().getSema().ActOnOpenMPRegionEnd(Body, TClauses); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2273,6 +2273,12 @@ // OpenMP Directives. 
//===----------------------------------------------------------------------===// +void ASTStmtReader::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) { + VisitStmt(S); + for (Stmt *&SubStmt : S->SubStmts) + SubStmt = Record.readSubStmt(); +} + void ASTStmtReader::VisitOMPExecutableDirective(OMPExecutableDirective *E) { Record.readOMPChildren(E->Data); E->setLocStart(readSourceLocation()); @@ -3130,6 +3136,10 @@ nullptr); break; + case STMT_OMP_CANONICAL_LOOP: + S = OMPCanonicalLoop::createEmpty(Context); + break; + case STMT_OMP_PARALLEL_DIRECTIVE: S = OMPParallelDirective::CreateEmpty(Context, diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2170,6 +2170,13 @@ // OpenMP Directives. //===----------------------------------------------------------------------===// +void ASTStmtWriter::VisitOMPCanonicalLoop(OMPCanonicalLoop *S) { + VisitStmt(S); + for (Stmt *SubStmt : S->SubStmts) + Record.AddStmt(SubStmt); + Code = serialization::STMT_OMP_CANONICAL_LOOP; +} + void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) { Record.writeOMPChildren(E->Data); Record.AddSourceLocation(E->getBeginLoc()); diff --git a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -0,0 +1,147 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs +// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +struct MyIterator { + MyIterator(unsigned pos); + MyIterator(const MyIterator &other); + const MyIterator &operator=(const MyIterator &that); + MyIterator 
&operator++(); + int operator-(const MyIterator &that) const; + MyIterator &operator+=(unsigned a); + MyIterator operator+(unsigned a) const; + bool operator==(const MyIterator &that) const; + bool operator!=(const MyIterator &that) const; + unsigned operator*() const; +}; + +extern "C" void workshareloop_iterator(float *a, float *b, float *c) { +#pragma omp for + for (MyIterator it = MyIterator(7); it != MyIterator(41); ++it) { + unsigned i = *it; + a[i] = b[i] * c[i]; + } +} + +#endif // HEADER +// CHECK-LABEL: define {{[^@]+}}@workshareloop_iterator +// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[IT:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1 +// CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[P_STRIDE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// CHECK-NEXT: store float* [[B]], float** [[B_ADDR]], align 8 +// CHECK-NEXT: store float* [[C]], float** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[IT]], i32 7) +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store %struct.MyIterator* [[IT]], %struct.MyIterator** [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* 
[[AGG_CAPTURED1]], i32 0, i32 0 +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[IT]]) +// CHECK-NEXT: call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]]) +// CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] +// CHECK: omp_loop.preheader: +// CHECK-NEXT: store i64 0, i64* [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DOTCOUNT]], 1 +// CHECK-NEXT: store i64 [[TMP2]], i64* [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1) +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]] +// CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] +// CHECK: omp_loop.header: +// CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]] +// CHECK: omp_loop.cond: +// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP6]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp_loop.body: +// CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP3]] +// CHECK-NEXT: call void @__captured_stmt.1(%struct.MyIterator* [[IT]], i64 [[TMP7]], %struct.anon.0* [[AGG_CAPTURED1]]) +// CHECK-NEXT: [[CALL:%.*]] = call i32 
@_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[IT]]) +// CHECK-NEXT: store i32 [[CALL]], i32* [[I]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load float*, float** [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP8]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load float*, float** [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP12]] to i64 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP10]], [[TMP13]] +// CHECK-NEXT: [[TMP14:%.*]] = load float*, float** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM4]] +// CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX5]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC]] +// CHECK: omp_loop.inc: +// CHECK-NEXT: [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER]] +// CHECK: omp_loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER:%.*]] +// CHECK: omp_loop.after: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt +// CHECK-SAME: 
(i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1 +// CHECK-NEXT: store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN10MyIteratorC1Ej(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], i32 41) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP2]]) +// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[CALL]] to i64 +// CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[CONV]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: store i64 [[DIV]], i64* [[TMP3]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1 +// CHECK-SAME: (%struct.MyIterator* nonnull align 1 dereferenceable(1) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca %struct.MyIterator*, align 8 +// CHECK-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1 +// CHECK-NEXT: store %struct.MyIterator* [[LOOPVAR]], %struct.MyIterator** 
[[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]]) +// CHECK-NEXT: [[TMP3:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call nonnull align 1 dereferenceable(1) %struct.MyIterator* @_ZN10MyIteratoraSERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP3]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// CHECK-NEXT: ret void +// diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -0,0 +1,164 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs +// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +struct MyIterator { + MyIterator(unsigned pos); + MyIterator(const MyIterator &other); + const MyIterator &operator=(const MyIterator &that); + MyIterator &operator++(); + int operator-(const MyIterator &that) const; + MyIterator &operator+=(unsigned a); + MyIterator operator+(unsigned a) 
const; + bool operator==(const MyIterator &that) const; + bool operator!=(const MyIterator &that) const; + unsigned operator*() const; +}; + +struct MyRange { + MyRange(int n); + + MyIterator begin(); + MyIterator end(); +}; + +extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { +#pragma omp for + for (unsigned i : MyRange(42)) { + a[i] = b[i] * c[i]; + } +} + +#endif // HEADER +// CHECK-LABEL: define {{[^@]+}}@workshareloop_rangefor +// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]]) [[ATTR0:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[__RANGE2:%.*]] = alloca %struct.MyRange*, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_MYRANGE:%.*]], align 1 +// CHECK-NEXT: [[__BEGIN2:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1 +// CHECK-NEXT: [[__END2:%.*]] = alloca [[STRUCT_MYITERATOR]], align 1 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 1 +// CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[P_STRIDE:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// CHECK-NEXT: store float* [[B]], float** [[B_ADDR]], align 8 +// CHECK-NEXT: store float* [[C]], float** [[C_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN7MyRangeC1Ei(%struct.MyRange* nonnull dereferenceable(1) [[REF_TMP]], i32 42) +// CHECK-NEXT: store %struct.MyRange* [[REF_TMP]], %struct.MyRange** [[__RANGE2]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 
8 +// CHECK-NEXT: call void @_ZN7MyRange5beginEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__BEGIN2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = load %struct.MyRange*, %struct.MyRange** [[__RANGE2]], align 8 +// CHECK-NEXT: call void @_ZN7MyRange3endEv(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[__END2]], %struct.MyRange* nonnull dereferenceable(1) [[TMP1]]) +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[__BEGIN2]]) +// CHECK-NEXT: store i32 [[CALL]], i32* [[I]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store %struct.MyIterator* [[__BEGIN2]], %struct.MyIterator** [[TMP2]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 1 +// CHECK-NEXT: store %struct.MyIterator* [[__END2]], %struct.MyIterator** [[TMP3]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0 +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(%struct.MyIterator* nonnull dereferenceable(1) [[TMP4]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[__BEGIN2]]) +// CHECK-NEXT: call void @__captured_stmt(i64* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]]) +// CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, i64* [[DOTCOUNT_ADDR]], align 8 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] +// CHECK: omp_loop.preheader: +// CHECK-NEXT: store i64 0, i64* [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DOTCOUNT]], 1 +// CHECK-NEXT: store i64 [[TMP5]], i64* [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: store i64 1, i64* [[P_STRIDE]], align 8 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]) +// CHECK-NEXT: call void 
@__kmpc_for_static_init_8u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i64* [[P_LOWERBOUND]], i64* [[P_UPPERBOUND]], i64* [[P_STRIDE]], i64 1, i64 1) +// CHECK-NEXT: [[TMP6:%.*]] = load i64, i64* [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]] +// CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] +// CHECK: omp_loop.header: +// CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]] +// CHECK: omp_loop.cond: +// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP9]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp_loop.body: +// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP6]] +// CHECK-NEXT: call void @__captured_stmt.1(i32* [[I]], i64 [[TMP10]], %struct.anon.0* [[AGG_CAPTURED1]]) +// CHECK-NEXT: [[TMP11:%.*]] = load float*, float** [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP12]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load float*, float** [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP13]], [[TMP16]] +// CHECK-NEXT: [[TMP17:%.*]] = load float*, float** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] 
= load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP17]], i64 [[IDXPROM4]] +// CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX5]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC]] +// CHECK: omp_loop.inc: +// CHECK-NEXT: [[OMP_LOOP_NEXT]] = add nuw i64 [[OMP_LOOP_IV]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER]] +// CHECK: omp_loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM6]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER:%.*]] +// CHECK: omp_loop.after: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt +// CHECK-SAME: (i64* nonnull align 8 dereferenceable(8) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR2:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DISTANCE_ADDR:%.*]] = alloca i64*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8 +// CHECK-NEXT: store i64* [[DISTANCE]], i64** [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = load %struct.MyIterator*, %struct.MyIterator** [[TMP3]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK10MyIteratormiERKS_(%struct.MyIterator* 
nonnull dereferenceable(1) [[TMP2]], %struct.MyIterator* nonnull align 1 dereferenceable(1) [[TMP4]]) +// CHECK-NEXT: [[CONV:%.*]] = sext i32 [[CALL]] to i64 +// CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[CONV]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i64*, i64** [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: store i64 [[DIV]], i64* [[TMP5]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1 +// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i64 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_MYITERATOR:%.*]], align 1 +// CHECK-NEXT: store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i64 [[LOGICAL]], i64* [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(%struct.MyIterator* sret(%struct.MyIterator) align 1 [[REF_TMP]], %struct.MyIterator* nonnull dereferenceable(1) [[TMP1]], i32 [[CONV]]) +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK10MyIteratordeEv(%struct.MyIterator* nonnull dereferenceable(1) [[REF_TMP]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i32 [[CALL]], i32* [[TMP3]], align 4 +// CHECK-NEXT: ret void +// diff --git 
a/clang/test/OpenMP/irbuilder_for_unsigned.c b/clang/test/OpenMP/irbuilder_for_unsigned.c new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/irbuilder_for_unsigned.c @@ -0,0 +1,147 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs +// RUN: %clang_cc1 -fopenmp-enable-irbuilder -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +extern "C" void workshareloop_unsigned(float *a, float *b, float *c, float *d) { +#pragma omp for + for (unsigned i = 33; i < 32000000; i += 7) { + a[i] = b[i] * c[i] * d[i]; + } +} + +#endif // HEADER +// CHECK-LABEL: define {{[^@]+}}@workshareloop_unsigned +// CHECK-SAME: (float* [[A:%.*]], float* [[B:%.*]], float* [[C:%.*]], float* [[D:%.*]]) [[ATTR0:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED:%.*]] = alloca [[STRUCT_ANON:%.*]], align 8 +// CHECK-NEXT: [[AGG_CAPTURED1:%.*]] = alloca [[STRUCT_ANON_0:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store float* [[A]], float** [[A_ADDR]], align 8 +// CHECK-NEXT: store float* [[B]], float** [[B_ADDR]], align 8 +// CHECK-NEXT: store float* [[C]], float** [[C_ADDR]], align 8 +// CHECK-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK-NEXT: store i32 33, i32* [[I]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = 
getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[AGG_CAPTURED]], i32 0, i32 0 +// CHECK-NEXT: store i32* [[I]], i32** [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0]], %struct.anon.0* [[AGG_CAPTURED1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4 +// CHECK-NEXT: call void @__captured_stmt(i32* [[DOTCOUNT_ADDR]], %struct.anon* [[AGG_CAPTURED]]) +// CHECK-NEXT: [[DOTCOUNT:%.*]] = load i32, i32* [[DOTCOUNT_ADDR]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] +// CHECK: omp_loop.preheader: +// CHECK-NEXT: store i32 0, i32* [[P_LOWERBOUND]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT]], 1 +// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND]], align 4 +// CHECK-NEXT: store i32 1, i32* [[P_STRIDE]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, i32* [[P_LASTITER]], i32* [[P_LOWERBOUND]], i32* [[P_UPPERBOUND]], i32* [[P_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]] +// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] +// CHECK: omp_loop.header: +// CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]] +// CHECK: omp_loop.cond: +// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i32 [[OMP_LOOP_IV]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp_loop.body: +// CHECK-NEXT: [[TMP8:%.*]] = add i32 
[[OMP_LOOP_IV]], [[TMP4]] +// CHECK-NEXT: call void @__captured_stmt.1(i32* [[I]], i32 [[TMP8]], %struct.anon.0* [[AGG_CAPTURED1]]) +// CHECK-NEXT: [[TMP9:%.*]] = load float*, float** [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP10]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load float*, float** [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP13]] to i64 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[TMP14]] +// CHECK-NEXT: [[TMP15:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP16]] to i64 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX5]], align 4 +// CHECK-NEXT: [[MUL6:%.*]] = fmul float [[MUL]], [[TMP17]] +// CHECK-NEXT: [[TMP18:%.*]] = load float*, float** [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[I]], align 4 +// CHECK-NEXT: [[IDXPROM7:%.*]] = zext i32 [[TMP19]] to i64 +// CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[TMP18]], i64 [[IDXPROM7]] +// CHECK-NEXT: store float [[MUL6]], float* [[ARRAYIDX8]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC]] +// CHECK: omp_loop.inc: +// CHECK-NEXT: [[OMP_LOOP_NEXT]] = add nuw i32 [[OMP_LOOP_IV]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER]] +// CHECK: omp_loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* 
[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM9]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER:%.*]] +// CHECK: omp_loop.after: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt +// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[DISTANCE:%.*]], %struct.anon* noalias [[__CONTEXT:%.*]]) [[ATTR1:#.*]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DISTANCE_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon*, align 8 +// CHECK-NEXT: store i32* [[DISTANCE]], i32** [[DISTANCE_ADDR]], align 8 +// CHECK-NEXT: store %struct.anon* [[__CONTEXT]], %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON:%.*]], %struct.anon* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP3]], 32000000 +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub i32 32000000, [[TMP6]] +// CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[SUB]], 7 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[DIV]], [[COND_TRUE]] ], [ 0, [[COND_FALSE]] ] +// CHECK-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DISTANCE_ADDR]], 
align 8 +// CHECK-NEXT: store i32 [[COND]], i32* [[TMP7]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@__captured_stmt.1 +// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[LOOPVAR:%.*]], i32 [[LOGICAL:%.*]], %struct.anon.0* noalias [[__CONTEXT:%.*]]) [[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LOOPVAR_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-NEXT: [[LOGICAL_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[__CONTEXT_ADDR:%.*]] = alloca %struct.anon.0*, align 8 +// CHECK-NEXT: store i32* [[LOOPVAR]], i32** [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i32 [[LOGICAL]], i32* [[LOGICAL_ADDR]], align 4 +// CHECK-NEXT: store %struct.anon.0* [[__CONTEXT]], %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_0:%.*]], %struct.anon.0* [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[LOGICAL_ADDR]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul i32 7, [[TMP3]] +// CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP2]], [[MUL]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32*, i32** [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP4]], align 4 +// CHECK-NEXT: ret void +// diff --git a/clang/test/OpenMP/irbuilder_nested_parallel_for.c b/clang/test/OpenMP/irbuilder_nested_parallel_for.c --- a/clang/test/OpenMP/irbuilder_nested_parallel_for.c +++ b/clang/test/OpenMP/irbuilder_nested_parallel_for.c @@ -23,15 +23,15 @@ // // CHECK-DEBUG-LABEL: @_Z14parallel_for_0v( // CHECK-DEBUG-NEXT: entry: -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]), [[DBG10:!dbg !.*]] +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1:@.*]]), [[DBG12:!dbg !.*]] // CHECK-DEBUG-NEXT: 
br label [[OMP_PARALLEL:%.*]] // CHECK-DEBUG: omp_parallel: -// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)), [[DBG11:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @_Z14parallel_for_0v..omp_par to void (i32*, i32*, ...)*)), [[DBG13:!dbg !.*]] // CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT:%.*]] // CHECK-DEBUG: omp.par.outlined.exit: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK-DEBUG: omp.par.exit.split: -// CHECK-DEBUG-NEXT: ret void, [[DBG14:!dbg !.*]] +// CHECK-DEBUG-NEXT: ret void, [[DBG17:!dbg !.*]] // void parallel_for_0(void) { #pragma omp parallel @@ -53,9 +53,9 @@ // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) // CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] // CHECK: omp_parallel: -// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) -// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT19:%.*]] -// CHECK: omp.par.outlined.exit19: +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) +// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT16:%.*]] +// CHECK: omp.par.outlined.exit16: // CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK: omp.par.exit.split: // CHECK-NEXT: ret void @@ -66,20 +66,20 @@ // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 // CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META41:metadata !.*]], metadata !DIExpression()), [[DBG42:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META67:metadata !.*]], metadata !DIExpression()), [[DBG68:!dbg !.*]] // CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META43:metadata !.*]], metadata !DIExpression()), [[DBG44:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META69:metadata !.*]], metadata !DIExpression()), [[DBG70:!dbg !.*]] // CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META45:metadata !.*]], metadata !DIExpression()), [[DBG46:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB10:@.*]]), [[DBG47:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META71:metadata !.*]], metadata !DIExpression()), [[DBG72:!dbg !.*]] +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB6:@.*]]), [[DBG73:!dbg !.*]] // CHECK-DEBUG-NEXT: 
br label [[OMP_PARALLEL:%.*]] // CHECK-DEBUG: omp_parallel: -// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB10]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.1 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG48:!dbg !.*]] -// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT19:%.*]] -// CHECK-DEBUG: omp.par.outlined.exit19: +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB6]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_1Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG74:!dbg !.*]] +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT16:%.*]] +// CHECK-DEBUG: omp.par.outlined.exit16: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK-DEBUG: omp.par.exit.split: -// CHECK-DEBUG-NEXT: ret void, [[DBG50:!dbg !.*]] +// CHECK-DEBUG-NEXT: ret void, [[DBG76:!dbg !.*]] // void parallel_for_1(float *r, int a, double b) { #pragma omp parallel @@ -99,76 +99,72 @@ // CHECK-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 -// CHECK-NEXT: [[DOTOMP_IV212:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP213:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_LB214:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_UB215:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_STRIDE216:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[I218:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I185:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[AGG_CAPTURED186:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 +// CHECK-NEXT: 
[[AGG_CAPTURED187:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 +// CHECK-NEXT: [[DOTCOUNT_ADDR188:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LASTITER203:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_LOWERBOUND204:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_UPPERBOUND205:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[P_STRIDE206:%.*]] = alloca i32, align 4 // CHECK-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 // CHECK-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 // CHECK-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) // CHECK-NEXT: br label [[OMP_PARALLEL:%.*]] // CHECK: omp_parallel: -// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) -// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT211:%.*]] -// CHECK: omp.par.outlined.exit211: +// CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* [[GLOB1]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]) +// CHECK-NEXT: br label [[OMP_PAR_OUTLINED_EXIT184:%.*]] +// CHECK: omp.par.outlined.exit184: // CHECK-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK: omp.par.exit.split: -// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB214]], align 4 -// CHECK-NEXT: store i32 99, i32* [[DOTOMP_UB215]], align 4 -// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE216]], align 4 -// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4 -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB41:@.*]]) -// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1) -// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4 -// CHECK-NEXT: [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99 -// CHECK-NEXT: br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]] -// CHECK: cond.true221: -// CHECK-NEXT: br label [[COND_END223:%.*]] -// CHECK: cond.false222: -// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4 -// CHECK-NEXT: br label [[COND_END223]] -// CHECK: cond.end223: -// CHECK-NEXT: [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ] -// CHECK-NEXT: store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4 -// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4 -// CHECK-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4 -// CHECK-NEXT: br label [[OMP_INNER_FOR_COND225:%.*]] -// CHECK: omp.inner.for.cond225: -// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = load 
i32, i32* [[DOTOMP_UB215]], align 4 -// CHECK-NEXT: [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]] -// CHECK-NEXT: br i1 [[CMP226]], label [[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]] -// CHECK: omp.inner.for.body227: -// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 -// CHECK-NEXT: [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1 -// CHECK-NEXT: [[ADD229:%.*]] = add nsw i32 0, [[MUL228]] -// CHECK-NEXT: store i32 [[ADD229]], i32* [[I218]], align 4 -// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4 -// CHECK-NEXT: [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double -// CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8 -// CHECK-NEXT: [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]] -// CHECK-NEXT: [[CONV232:%.*]] = fptrunc double [[ADD231]] to float -// CHECK-NEXT: [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8 -// CHECK-NEXT: store float [[CONV232]], float* [[TMP8]], align 4 -// CHECK-NEXT: br label [[OMP_BODY_CONTINUE233:%.*]] -// CHECK: omp.body.continue233: -// CHECK-NEXT: br label [[OMP_INNER_FOR_INC234:%.*]] -// CHECK: omp.inner.for.inc234: -// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4 -// CHECK-NEXT: [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1 -// CHECK-NEXT: store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4 -// CHECK-NEXT: br label [[OMP_INNER_FOR_COND225]] -// CHECK: omp.inner.for.end236: -// CHECK-NEXT: br label [[OMP_LOOP_EXIT237:%.*]] -// CHECK: omp.loop.exit237: -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB43:@.*]]) -// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB2]], i32 [[OMP_GLOBAL_THREAD_NUM238]]) -// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) -// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB7:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM239]]) +// CHECK-NEXT: store i32 0, i32* [[I185]], 
align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0 +// CHECK-NEXT: store i32* [[I185]], i32** [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[I185]], align 4 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4 +// CHECK-NEXT: call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]) +// CHECK-NEXT: [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]] +// CHECK: omp_loop.preheader190: +// CHECK-NEXT: store i32 0, i32* [[P_LOWERBOUND204]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1 +// CHECK-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4 +// CHECK-NEXT: store i32 1, i32* [[P_STRIDE206]], align 4 +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1) +// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]] +// CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER191:%.*]] +// CHECK: omp_loop.header191: +// CHECK-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ] +// CHECK-NEXT: br label [[OMP_LOOP_COND192:%.*]] +// CHECK: omp_loop.cond192: +// CHECK-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]] +// CHECK-NEXT: br i1 
[[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]] +// CHECK: omp_loop.body193: +// CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]] +// CHECK-NEXT: call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]) +// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4 +// CHECK-NEXT: [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double +// CHECK-NEXT: [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8 +// CHECK-NEXT: [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]] +// CHECK-NEXT: [[CONV202:%.*]] = fptrunc double [[ADD201]] to float +// CHECK-NEXT: [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8 +// CHECK-NEXT: store float [[CONV202]], float* [[TMP11]], align 4 +// CHECK-NEXT: br label [[OMP_LOOP_INC194]] +// CHECK: omp_loop.inc194: +// CHECK-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1 +// CHECK-NEXT: br label [[OMP_LOOP_HEADER191]] +// CHECK: omp_loop.exit195: +// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM207]]) +// CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB1]]) +// CHECK-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB2:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM208]]) +// CHECK-NEXT: br label [[OMP_LOOP_AFTER196:%.*]] +// CHECK: omp_loop.after196: // CHECK-NEXT: ret void // // CHECK-DEBUG-LABEL: @_Z14parallel_for_2Pfid( @@ -176,86 +172,77 @@ // CHECK-DEBUG-NEXT: [[R_ADDR:%.*]] = alloca float*, align 8 // CHECK-DEBUG-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK-DEBUG-NEXT: [[B_ADDR:%.*]] = alloca double, align 8 -// CHECK-DEBUG-NEXT: [[DOTOMP_IV212:%.*]] = alloca i32, align 4 -// CHECK-DEBUG-NEXT: [[TMP213:%.*]] = alloca i32, align 4 -// CHECK-DEBUG-NEXT: [[DOTOMP_LB214:%.*]] = alloca i32, align 4 -// CHECK-DEBUG-NEXT: [[DOTOMP_UB215:%.*]] = alloca i32, align 4 -// CHECK-DEBUG-NEXT: [[DOTOMP_STRIDE216:%.*]] = alloca 
i32, align 4 -// CHECK-DEBUG-NEXT: [[DOTOMP_IS_LAST217:%.*]] = alloca i32, align 4 -// CHECK-DEBUG-NEXT: [[I218:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[I185:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[AGG_CAPTURED186:%.*]] = alloca [[STRUCT_ANON_17:%.*]], align 8 +// CHECK-DEBUG-NEXT: [[AGG_CAPTURED187:%.*]] = alloca [[STRUCT_ANON_18:%.*]], align 4 +// CHECK-DEBUG-NEXT: [[DOTCOUNT_ADDR188:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[P_LASTITER203:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[P_LOWERBOUND204:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[P_UPPERBOUND205:%.*]] = alloca i32, align 4 +// CHECK-DEBUG-NEXT: [[P_STRIDE206:%.*]] = alloca i32, align 4 // CHECK-DEBUG-NEXT: store float* [[R:%.*]], float** [[R_ADDR]], align 8 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META77:metadata !.*]], metadata !DIExpression()), [[DBG78:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata float** [[R_ADDR]], [[META123:metadata !.*]], metadata !DIExpression()), [[DBG124:!dbg !.*]] // CHECK-DEBUG-NEXT: store i32 [[A:%.*]], i32* [[A_ADDR]], align 4 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META79:metadata !.*]], metadata !DIExpression()), [[DBG80:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[A_ADDR]], [[META125:metadata !.*]], metadata !DIExpression()), [[DBG126:!dbg !.*]] // CHECK-DEBUG-NEXT: store double [[B:%.*]], double* [[B_ADDR]], align 8 -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META81:metadata !.*]], metadata !DIExpression()), [[DBG82:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB21:@.*]]), [[DBG83:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata double* [[B_ADDR]], [[META127:metadata !.*]], metadata !DIExpression()), [[DBG128:!dbg !.*]] +// CHECK-DEBUG-NEXT: 
[[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB13:@.*]]), [[DBG129:!dbg !.*]] // CHECK-DEBUG-NEXT: br label [[OMP_PARALLEL:%.*]] // CHECK-DEBUG: omp_parallel: -// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB21]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.4 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG84:!dbg !.*]] -// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT211:%.*]] -// CHECK-DEBUG: omp.par.outlined.exit211: +// CHECK-DEBUG-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* [[GLOB13]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, double*, float**)* @_Z14parallel_for_2Pfid..omp_par.23 to void (i32*, i32*, ...)*), i32* [[A_ADDR]], double* [[B_ADDR]], float** [[R_ADDR]]), [[DBG130:!dbg !.*]] +// CHECK-DEBUG-NEXT: br label [[OMP_PAR_OUTLINED_EXIT184:%.*]] +// CHECK-DEBUG: omp.par.outlined.exit184: // CHECK-DEBUG-NEXT: br label [[OMP_PAR_EXIT_SPLIT:%.*]] // CHECK-DEBUG: omp.par.exit.split: -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IV212]], [[META87:metadata !.*]], metadata !DIExpression()), [[DBG89:!dbg !.*]] -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_LB214]], [[META90:metadata !.*]], metadata !DIExpression()), [[DBG89]] -// CHECK-DEBUG-NEXT: store i32 0, i32* [[DOTOMP_LB214]], align 4, [[DBG91:!dbg !.*]] -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_UB215]], [[META92:metadata !.*]], metadata !DIExpression()), [[DBG89]] -// CHECK-DEBUG-NEXT: store i32 99, i32* [[DOTOMP_UB215]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_STRIDE216]], [[META93:metadata !.*]], metadata !DIExpression()), [[DBG89]] -// CHECK-DEBUG-NEXT: 
store i32 1, i32* [[DOTOMP_STRIDE216]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[DOTOMP_IS_LAST217]], [[META94:metadata !.*]], metadata !DIExpression()), [[DBG89]] -// CHECK-DEBUG-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST217]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[I218]], [[META95:metadata !.*]], metadata !DIExpression()), [[DBG89]] -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM219:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB79:@.*]]) -// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* [[GLOB78:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM219]], i32 34, i32* [[DOTOMP_IS_LAST217]], i32* [[DOTOMP_LB214]], i32* [[DOTOMP_UB215]], i32* [[DOTOMP_STRIDE216]], i32 1, i32 1), [[DBG96:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: [[CMP220:%.*]] = icmp sgt i32 [[TMP0]], 99, [[DBG91]] -// CHECK-DEBUG-NEXT: br i1 [[CMP220]], label [[COND_TRUE221:%.*]], label [[COND_FALSE222:%.*]], [[DBG91]] -// CHECK-DEBUG: cond.true221: -// CHECK-DEBUG-NEXT: br label [[COND_END223:%.*]], [[DBG91]] -// CHECK-DEBUG: cond.false222: -// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: br label [[COND_END223]], [[DBG91]] -// CHECK-DEBUG: cond.end223: -// CHECK-DEBUG-NEXT: [[COND224:%.*]] = phi i32 [ 99, [[COND_TRUE221]] ], [ [[TMP1]], [[COND_FALSE222]] ], [[DBG91]] -// CHECK-DEBUG-NEXT: store i32 [[COND224]], i32* [[DOTOMP_UB215]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, i32* [[DOTOMP_LB214]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: store i32 [[TMP2]], i32* [[DOTOMP_IV212]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_COND225:%.*]], [[DBG97:!dbg !.*]] -// CHECK-DEBUG: omp.inner.for.cond225: -// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]] -// 
CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB215]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: [[CMP226:%.*]] = icmp sle i32 [[TMP3]], [[TMP4]], [[DBG98:!dbg !.*]] -// CHECK-DEBUG-NEXT: br i1 [[CMP226]], label [[OMP_INNER_FOR_BODY227:%.*]], label [[OMP_INNER_FOR_END236:%.*]], [[DBG97]] -// CHECK-DEBUG: omp.inner.for.body227: -// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: [[MUL228:%.*]] = mul nsw i32 [[TMP5]], 1, [[DBG99:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[ADD229:%.*]] = add nsw i32 0, [[MUL228]], [[DBG99]] -// CHECK-DEBUG-NEXT: store i32 [[ADD229]], i32* [[I218]], align 4, [[DBG99]] -// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4, [[DBG100:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[CONV230:%.*]] = sitofp i32 [[TMP6]] to double, [[DBG100]] -// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = load double, double* [[B_ADDR]], align 8, [[DBG101:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[ADD231:%.*]] = fadd double [[CONV230]], [[TMP7]], [[DBG102:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[CONV232:%.*]] = fptrunc double [[ADD231]] to float, [[DBG100]] -// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = load float*, float** [[R_ADDR]], align 8, [[DBG103:!dbg !.*]] -// CHECK-DEBUG-NEXT: store float [[CONV232]], float* [[TMP8]], align 4, [[DBG104:!dbg !.*]] -// CHECK-DEBUG-NEXT: br label [[OMP_BODY_CONTINUE233:%.*]], [[DBG105:!dbg !.*]] -// CHECK-DEBUG: omp.body.continue233: -// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_INC234:%.*]], [[DBG96]] -// CHECK-DEBUG: omp.inner.for.inc234: -// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV212]], align 4, [[DBG91]] -// CHECK-DEBUG-NEXT: [[ADD235:%.*]] = add nsw i32 [[TMP9]], 1, [[DBG98]] -// CHECK-DEBUG-NEXT: store i32 [[ADD235]], i32* [[DOTOMP_IV212]], align 4, [[DBG98]] -// CHECK-DEBUG-NEXT: br label [[OMP_INNER_FOR_COND225]], [[DBG96]], [[LOOP106:!llvm.loop !.*]] -// CHECK-DEBUG: omp.inner.for.end236: -// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_EXIT237:%.*]], [[DBG96]] -// 
CHECK-DEBUG: omp.loop.exit237: -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM238:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB82:@.*]]) -// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB81:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM238]]), [[DBG107:!dbg !.*]] -// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM239:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB82]]), [[DBG107]] -// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB83:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM239]]), [[DBG107]] -// CHECK-DEBUG-NEXT: ret void, [[DBG108:!dbg !.*]] +// CHECK-DEBUG-NEXT: call void @llvm.dbg.declare(metadata i32* [[I185]], [[META134:metadata !.*]], metadata !DIExpression()), [[DBG137:!dbg !.*]] +// CHECK-DEBUG-NEXT: store i32 0, i32* [[I185]], align 4, [[DBG137]] +// CHECK-DEBUG-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ANON_17]], %struct.anon.17* [[AGG_CAPTURED186]], i32 0, i32 0, [[DBG138:!dbg !.*]] +// CHECK-DEBUG-NEXT: store i32* [[I185]], i32** [[TMP0]], align 8, [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_ANON_18]], %struct.anon.18* [[AGG_CAPTURED187]], i32 0, i32 0, [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP2:%.*]] = load i32, i32* [[I185]], align 4, [[DBG139:!dbg !.*]] +// CHECK-DEBUG-NEXT: store i32 [[TMP2]], i32* [[TMP1]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: call void @__captured_stmt.19(i32* [[DOTCOUNT_ADDR188]], %struct.anon.17* [[AGG_CAPTURED186]]), [[DBG138]] +// CHECK-DEBUG-NEXT: [[DOTCOUNT189:%.*]] = load i32, i32* [[DOTCOUNT_ADDR188]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_PREHEADER190:%.*]], [[DBG138]] +// CHECK-DEBUG: omp_loop.preheader190: +// CHECK-DEBUG-NEXT: store i32 0, i32* [[P_LOWERBOUND204]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP3:%.*]] = sub i32 [[DOTCOUNT189]], 1, [[DBG138]] +// CHECK-DEBUG-NEXT: store i32 [[TMP3]], i32* [[P_UPPERBOUND205]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: store i32 1, i32* 
[[P_STRIDE206]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM207:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB42:@.*]]), [[DBG138]] +// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* [[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]], i32 34, i32* [[P_LASTITER203]], i32* [[P_LOWERBOUND204]], i32* [[P_UPPERBOUND205]], i32* [[P_STRIDE206]], i32 1, i32 1), [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP4:%.*]] = load i32, i32* [[P_LOWERBOUND204]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP5:%.*]] = load i32, i32* [[P_UPPERBOUND205]], align 4, [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], [[TMP4]], [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 1, [[DBG138]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191:%.*]], [[DBG138]] +// CHECK-DEBUG: omp_loop.header191: +// CHECK-DEBUG-NEXT: [[OMP_LOOP_IV197:%.*]] = phi i32 [ 0, [[OMP_LOOP_PREHEADER190]] ], [ [[OMP_LOOP_NEXT199:%.*]], [[OMP_LOOP_INC194:%.*]] ], [[DBG138]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_COND192:%.*]], [[DBG138]] +// CHECK-DEBUG: omp_loop.cond192: +// CHECK-DEBUG-NEXT: [[OMP_LOOP_CMP198:%.*]] = icmp ult i32 [[OMP_LOOP_IV197]], [[TMP7]], [[DBG138]] +// CHECK-DEBUG-NEXT: br i1 [[OMP_LOOP_CMP198]], label [[OMP_LOOP_BODY193:%.*]], label [[OMP_LOOP_EXIT195:%.*]], [[DBG138]] +// CHECK-DEBUG: omp_loop.body193: +// CHECK-DEBUG-NEXT: [[TMP8:%.*]] = add i32 [[OMP_LOOP_IV197]], [[TMP4]], [[DBG138]] +// CHECK-DEBUG-NEXT: call void @__captured_stmt.20(i32* [[I185]], i32 [[TMP8]], %struct.anon.18* [[AGG_CAPTURED187]]), [[DBG138]] +// CHECK-DEBUG-NEXT: [[TMP9:%.*]] = load i32, i32* [[A_ADDR]], align 4, [[DBG140:!dbg !.*]] +// CHECK-DEBUG-NEXT: [[CONV200:%.*]] = sitofp i32 [[TMP9]] to double, [[DBG140]] +// CHECK-DEBUG-NEXT: [[TMP10:%.*]] = load double, double* [[B_ADDR]], align 8, [[DBG141:!dbg !.*]] +// CHECK-DEBUG-NEXT: [[ADD201:%.*]] = fadd double [[CONV200]], [[TMP10]], [[DBG142:!dbg !.*]] +// CHECK-DEBUG-NEXT: 
[[CONV202:%.*]] = fptrunc double [[ADD201]] to float, [[DBG140]] +// CHECK-DEBUG-NEXT: [[TMP11:%.*]] = load float*, float** [[R_ADDR]], align 8, [[DBG143:!dbg !.*]] +// CHECK-DEBUG-NEXT: store float [[CONV202]], float* [[TMP11]], align 4, [[DBG144:!dbg !.*]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_INC194]], [[DBG138]] +// CHECK-DEBUG: omp_loop.inc194: +// CHECK-DEBUG-NEXT: [[OMP_LOOP_NEXT199]] = add nuw i32 [[OMP_LOOP_IV197]], 1, [[DBG138]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_HEADER191]], [[DBG138]] +// CHECK-DEBUG: omp_loop.exit195: +// CHECK-DEBUG-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* [[GLOB42]], i32 [[OMP_GLOBAL_THREAD_NUM207]]), [[DBG138]] +// CHECK-DEBUG-NEXT: [[OMP_GLOBAL_THREAD_NUM208:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[GLOB42]]), [[DBG141]] +// CHECK-DEBUG-NEXT: call void @__kmpc_barrier(%struct.ident_t* [[GLOB43:@.*]], i32 [[OMP_GLOBAL_THREAD_NUM208]]), [[DBG141]] +// CHECK-DEBUG-NEXT: br label [[OMP_LOOP_AFTER196:%.*]], [[DBG138]] +// CHECK-DEBUG: omp_loop.after196: +// CHECK-DEBUG-NEXT: ret void, [[DBG145:!dbg !.*]] // void parallel_for_2(float *r, int a, double b) { #pragma omp parallel diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2037,6 +2037,7 @@ void VisitPseudoObjectExpr(const PseudoObjectExpr *E); void VisitOpaqueValueExpr(const OpaqueValueExpr *E); void VisitLambdaExpr(const LambdaExpr *E); + void VisitOMPCanonicalLoop(const OMPCanonicalLoop *L); void VisitOMPExecutableDirective(const OMPExecutableDirective *D); void VisitOMPLoopDirective(const OMPLoopDirective *D); void VisitOMPParallelDirective(const OMPParallelDirective *D); @@ -2833,6 +2834,11 @@ Visit(E->getSyntacticForm()); } +void EnqueueVisitor::VisitOMPCanonicalLoop(const OMPCanonicalLoop *L) { + VisitStmt(L); + EnqueueChildren(L); +} + void EnqueueVisitor::VisitOMPExecutableDirective( const OMPExecutableDirective *D) { 
EnqueueChildren(D); @@ -5649,6 +5655,8 @@ return cxstring::createRef("attribute(warn_unused_result)"); case CXCursor_AlignedAttr: return cxstring::createRef("attribute(aligned)"); + case CXCursor_OMPCanonicalLoop: + return cxstring::createRef("OMPCanonicalLoop"); } llvm_unreachable("Unhandled CXCursorKind"); diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -639,6 +639,9 @@ case Stmt::MSDependentExistsStmtClass: K = CXCursor_UnexposedStmt; break; + case Stmt::OMPCanonicalLoopClass: + K = CXCursor_OMPCanonicalLoop; + break; case Stmt::OMPParallelDirectiveClass: K = CXCursor_OMPParallelDirective; break; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -31,6 +31,7 @@ /// Create a new OpenMPIRBuilder operating on the given module \p M. This will /// not have an effect on \p M (see initialize). OpenMPIRBuilder(Module &M) : M(M), Builder(M.getContext()) {} + ~OpenMPIRBuilder(); /// Initialize the internal state, this will put structures types and /// potentially other helpers into the underlying module. Must be called @@ -38,10 +39,11 @@ void initialize(); /// Finalize the underlying module, e.g., by outlining regions. + /// \param Fn The function to be finalized. /// \param AllowExtractorSinking Flag to include sinking instructions, /// emitted by CodeExtractor, in the /// outlined region. Default is false. - void finalize(bool AllowExtractorSinking = false); + void finalize(Function *Fn, bool AllowExtractorSinking = false); /// Add attributes known for \p FnID to \p Fn. void addAttributes(omp::RuntimeFunction FnID, Function &Fn); @@ -364,6 +366,31 @@ bool NeedsBarrier, Value *Chunk = nullptr); + /// Modifies the canonical loop to be a workshare loop. 
+ /// + /// This takes a \p LoopInfo representing a canonical loop, such as the one + /// created by \p createCanonicalLoop and emits additional instructions to + /// turn it into a workshare loop. In particular, it calls to an OpenMP + /// runtime function in the preheader to obtain the loop bounds to be used in + /// the current thread, updates the relevant instructions in the canonical + /// loop and calls to an OpenMP runtime finalization function after the loop. + /// + /// \param Loc The source location description, the insertion location + /// is not used. + /// \param CLI A descriptor of the canonical loop to workshare. + /// \param AllocaIP An insertion point for Alloca instructions usable in the + /// preheader of the loop. + /// \param NeedsBarrier Indicates whether a barrier must be inserted after + /// the loop. + /// \param Chunk The size of loop chunk considered as a unit when + /// scheduling. If \p nullptr, defaults to 1. + /// + /// \returns Updated CanonicalLoopInfo. + CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc, + CanonicalLoopInfo *CLI, + InsertPointTy AllocaIP, + bool NeedsBarrier); + /// Tile a loop nest. /// /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in @@ -543,6 +570,9 @@ /// vector and set. void collectBlocks(SmallPtrSetImpl &BlockSet, SmallVectorImpl &BlockVector); + + /// Return the function that contains the region to be outlined. + Function *getFunction() const { return EntryBB->getParent(); } }; /// Collection of regions that need to be outlined during finalization. @@ -916,6 +946,8 @@ return {After, After->begin()}; }; + Function *getFunction() const { return Header->getParent(); } + /// Consistency self-check. 
void assertOK() const; }; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -126,15 +126,23 @@ void OpenMPIRBuilder::initialize() { initializeTypes(M); } -void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) { +void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) { SmallPtrSet ParallelRegionBlockSet; SmallVector Blocks; + SmallVector DeferredOutlines; for (OutlineInfo &OI : OutlineInfos) { + // Skip functions that have not finalized yet; may happen with nested + // function generation. + if (OI.getFunction() != Fn) { + DeferredOutlines.push_back(OI); + continue; + } + ParallelRegionBlockSet.clear(); Blocks.clear(); OI.collectBlocks(ParallelRegionBlockSet, Blocks); - Function *OuterFn = OI.EntryBB->getParent(); + Function *OuterFn = OI.getFunction(); CodeExtractorAnalysisCache CEAC(*OuterFn); CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr, /* AggregateArgs */ false, @@ -199,8 +207,12 @@ OI.PostOutlineCB(*OutlinedFn); } - // Allow finalize to be called multiple times. - OutlineInfos.clear(); + // Remove work items that have been completed. + OutlineInfos = std::move(DeferredOutlines); +} + +OpenMPIRBuilder::~OpenMPIRBuilder() { + assert(OutlineInfos.empty() && "There must be no outstanding outlinings"); } Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, @@ -971,7 +983,8 @@ // Emit the body content. We do it after connecting the loop to the CFG to // avoid that the callback encounters degenerate BBs. 
- BodyGenCB(CL->getBodyIP(), CL->getIndVar()); + if (BodyGenCB) + BodyGenCB(CL->getBodyIP(), CL->getIndVar()); #ifndef NDEBUG CL->assertOK(); @@ -1164,6 +1177,13 @@ return CLI; } +CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop( + const LocationDescription &Loc, CanonicalLoopInfo *CLI, + InsertPointTy AllocaIP, bool NeedsBarrier) { + // Currently only supports static schedules. + return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier); +} + /// Make \p Source branch to \p Target. /// /// Handles two situations: diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -794,7 +794,8 @@ BranchInst::Create(AfterBB, AfterIP.getBlock()); // Perform the actual outlining. - OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true); + OMPInfoCache.OMPBuilder.finalize(OriginalFn, + /* AllowExtractorSinking */ true); Function *OutlinedFn = MergableCIs.front()->getCaller();