diff --git a/.gitignore b/.gitignore --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ # Nested build directory /build* +*.FileCheck #==============================================================================# # Explicit files to ignore (only matches one). diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2588,7 +2588,11 @@ */ CXCursor_OMPMaskedDirective = 292, - CXCursor_LastStmt = CXCursor_OMPMaskedDirective, + /** OpenMP unroll directive. + */ + CXCursor_OMPUnrollDirective = 293, + + CXCursor_LastStmt = CXCursor_OMPUnrollDirective, /** * Cursor that represents the translation unit itself. diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -888,6 +888,113 @@ } }; +/// Representation of the 'full' clause of the '#pragma omp unroll' directive. +/// +/// \code +/// #pragma omp unroll full +/// for (int i = 0; i < 64; ++i) +/// \endcode +class OMPFullClause final : public OMPClause { + friend class OMPClauseReader; + + /// Build an empty clause. + explicit OMPFullClause() : OMPClause(llvm::omp::OMPC_full, {}, {}) {} + +public: + /// Build an AST node for a 'full' clause. + /// + /// \param C Context of the AST. + /// \param Loc Location of the 'full' identifier. + static OMPFullClause *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc); + + /// Build an empty 'full' AST node for deserialization. + /// + /// \param C Context of the AST. 
+ static OMPFullClause *CreateEmpty(const ASTContext &C); + + child_range children() { return {child_iterator(), child_iterator()}; } + const_child_range children() const { + return {const_child_iterator(), const_child_iterator()}; + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_full; + } +}; + +/// Representation of the 'partial' clause of the '#pragma omp unroll' +/// directive. +/// +/// \code +/// #pragma omp unroll partial(4) +/// for (int i = start; i < end; ++i) +/// \endcode +class OMPPartialClause final : public OMPClause { + friend class OMPClauseReader; + + /// Location of '('. + SourceLocation LParenLoc; + + /// Optional argument to the clause (unroll factor). + Stmt *Factor; + + /// Build an empty clause. + explicit OMPPartialClause() : OMPClause(llvm::omp::OMPC_partial, {}, {}) {} + + /// Set the unroll factor. + void setFactor(Expr *E) { Factor = E; } + +public: + /// Build an AST node for a 'partial' clause. + /// + /// \param C Context of the AST. + /// \param StartLoc Location of the 'partial' identifier. + /// \param LParenLoc Location of '('. + /// \param EndLoc Location of ')'. + /// \param Factor Clause argument. + static OMPPartialClause *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, Expr *Factor); + + /// Build an empty 'partial' AST node for deserialization. + /// + /// \param C Context of the AST. + static OMPPartialClause *CreateEmpty(const ASTContext &C); + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + /// Returns the argument of the clause or nullptr if not set. 
+  Expr *getFactor() const { return cast_or_null<Expr>(Factor); }
+
+  child_range children() { return child_range(&Factor, &Factor + 1); }
+  const_child_range children() const {
+    return const_child_range(&Factor, &Factor + 1);
+  }
+
+  child_range used_children() {
+    return child_range(child_iterator(), child_iterator());
+  }
+  const_child_range used_children() const {
+    return const_child_range(const_child_iterator(), const_child_iterator());
+  }
+
+  static bool classof(const OMPClause *T) {
+    return T->getClauseKind() == llvm::omp::OMPC_partial;
+  }
+};
+
 /// This represents 'collapse' clause in the '#pragma omp ...'
 /// directive.
 ///
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2810,6 +2810,9 @@
 DEF_TRAVERSE_STMT(OMPTileDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
+DEF_TRAVERSE_STMT(OMPUnrollDirective,
+                  { TRY_TO(TraverseOMPExecutableDirective(S)); })
+
 DEF_TRAVERSE_STMT(OMPForDirective,
                   { TRY_TO(TraverseOMPExecutableDirective(S)); })
 
@@ -3066,6 +3069,17 @@
   return true;
 }
 
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPFullClause(OMPFullClause *C) {
+  return true;
+}
+
+template <typename Derived>
+bool RecursiveASTVisitor<Derived>::VisitOMPPartialClause(OMPPartialClause *C) {
+  TRY_TO(TraverseStmt(C->getFactor()));
+  return true;
+}
+
 template <typename Derived>
 bool RecursiveASTVisitor<Derived>::VisitOMPCollapseClause(
     OMPCollapseClause *C) {
diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h
--- a/clang/include/clang/AST/StmtOpenMP.h
+++ b/clang/include/clang/AST/StmtOpenMP.h
@@ -460,17 +460,22 @@
   /// directive). Returns nullptr if no clause of this kind is associated with
   /// the directive.
   template <typename SpecificClause>
-  const SpecificClause *getSingleClause() const {
-    auto Clauses = getClausesOfKind<SpecificClause>();
+  static const SpecificClause *getSingleClause(ArrayRef<OMPClause *> Clauses) {
+    auto ClausesOfKind = getClausesOfKind<SpecificClause>(Clauses);
 
-    if (Clauses.begin() != Clauses.end()) {
-      assert(std::next(Clauses.begin()) == Clauses.end() &&
+    if (ClausesOfKind.begin() != ClausesOfKind.end()) {
+      assert(std::next(ClausesOfKind.begin()) == ClausesOfKind.end() &&
              "There are at least 2 clauses of the specified kind");
-      return *Clauses.begin();
+      return *ClausesOfKind.begin();
     }
     return nullptr;
   }
 
+  template <typename SpecificClause>
+  const SpecificClause *getSingleClause() const {
+    return getSingleClause<SpecificClause>(clauses());
+  }
+
   /// Returns true if the current directive has one or more clauses of a
   /// specific kind.
   template <typename SpecificClause>
@@ -5062,6 +5067,78 @@
   }
 };
 
+/// This represents the '#pragma omp unroll' loop transformation directive.
+///
+/// \code
+/// #pragma omp unroll
+/// for (int i = 0; i < 64; ++i)
+/// \endcode
+class OMPUnrollDirective final : public OMPLoopBasedDirective {
+  friend class ASTStmtReader;
+  friend class OMPExecutableDirective;
+
+  /// Default list of offsets.
+  enum {
+    PreInitsOffset = 0,
+    TransformedStmtOffset,
+  };
+
+  explicit OMPUnrollDirective(SourceLocation StartLoc, SourceLocation EndLoc)
+      : OMPLoopBasedDirective(OMPUnrollDirectiveClass, llvm::omp::OMPD_unroll,
+                              StartLoc, EndLoc, 1) {}
+
+  /// Set the pre-init statements.
+  void setPreInits(Stmt *PreInits) {
+    Data->getChildren()[PreInitsOffset] = PreInits;
+  }
+
+  /// Set the de-sugared statement.
+  void setTransformedStmt(Stmt *S) {
+    Data->getChildren()[TransformedStmtOffset] = S;
+  }
+
+public:
+  /// Create a new AST node representation for '#pragma omp unroll'.
+  ///
+  /// \param C        Context of the AST.
+  /// \param StartLoc Location of the introducer (e.g. the 'omp' token).
+  /// \param EndLoc   Location of the directive's end (e.g. the tok::eod).
+  /// \param Clauses  The directive's clauses.
+  /// \param AssociatedStmt  The outermost associated loop.
+  /// \param TransformedStmt The loop nest after tiling, or nullptr in
+  ///                        dependent contexts.
+  /// \param PreInits Helper preinits statements for the loop nest.
+  static OMPUnrollDirective *
+  Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc,
+         ArrayRef<OMPClause *> Clauses, Stmt *AssociatedStmt,
+         Stmt *TransformedStmt, Stmt *PreInits);
+
+  /// Build an empty '#pragma omp unroll' AST node for deserialization.
+  ///
+  /// \param C          Context of the AST.
+  /// \param NumClauses Number of clauses to allocate.
+  static OMPUnrollDirective *CreateEmpty(const ASTContext &C,
+                                         unsigned NumClauses);
+
+  /// Get the de-sugared associated loops after unrolling.
+  ///
+  /// This is only used if the unrolled loop becomes an associated loop of
+  /// another directive, otherwise the loop is emitted directly using loop
+  /// transformation metadata. When the unrolled loop cannot be used by another
+  /// directive (e.g. because of the full clause), the transformed stmt can also
+  /// be nullptr.
+  Stmt *getTransformedStmt() const {
+    return Data->getChildren()[TransformedStmtOffset];
+  }
+
+  /// Return the pre-init statements.
+  Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; }
+
+  static bool classof(const Stmt *T) {
+    return T->getStmtClass() == OMPUnrollDirectiveClass;
+  }
+};
+
 /// This represents '#pragma omp scan' directive.
 ///
 /// \code
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10691,6 +10691,9 @@
 def err_omp_dispatch_statement_call
     : Error<"statement after '#pragma omp dispatch' must be a direct call"
             " to a target function or an assignment to one">;
+def err_omp_unroll_full_variable_trip_count : Error<
+  "loop to be fully unrolled must have a constant trip count">;
+def note_omp_directive_here : Note<"'%0' directive found here">;
 } // end of OpenMP category
 
 let CategoryName = "Related Result Type Issue" in {
diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td
--- a/clang/include/clang/Basic/StmtNodes.td
+++ b/clang/include/clang/Basic/StmtNodes.td
@@ -223,6 +223,7 @@
 def OMPParallelDirective : StmtNode<OMPExecutableDirective>;
 def OMPSimdDirective : StmtNode<OMPLoopDirective>;
 def OMPTileDirective : StmtNode<OMPLoopBasedDirective>;
+def OMPUnrollDirective : StmtNode<OMPLoopBasedDirective>;
 def OMPForDirective : StmtNode<OMPLoopDirective>;
 def OMPForSimdDirective : StmtNode<OMPLoopDirective>;
 def OMPSectionsDirective : StmtNode<OMPExecutableDirective>;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -32,6 +32,7 @@
 #include "clang/AST/NSAPI.h"
 #include "clang/AST/PrettyPrinter.h"
 #include "clang/AST/StmtCXX.h"
+#include "clang/AST/StmtOpenMP.h"
 #include "clang/AST/TypeLoc.h"
 #include "clang/AST/TypeOrdering.h"
 #include "clang/Basic/BitmaskEnum.h"
@@ -10247,6 +10248,25 @@
                                          MapT &Map, unsigned Selector = 0,
                                          SourceRange SrcRange = SourceRange());
 
+  /// Analyzes and checks a loop nest for use by a loop transformation.
+  ///
+  /// \param Kind        The loop transformation directive kind.
+  /// \param NumLoops    How many nested loops the directive is expecting.
+  /// \param AStmt       Associated statement of the transformation directive.
+  /// \param LoopHelpers [out] The loop analysis result.
+  /// \param Body        [out] The body code nested in \p NumLoops loop.
+  /// \param OriginalInits [out] Collection of statements and declarations that
+  ///                      must have been executed/declared before entering the
+  ///                      loop.
+  ///
+  /// \return Whether there was any error.
+  bool checkTransformableLoopNest(
+      OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
+      SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
+      Stmt *&Body,
+      SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
+          &OriginalInits);
+
   /// Helper to keep information about the current `omp begin/end declare
   /// variant` nesting.
   struct OMPDeclareVariantScope {
@@ -10543,6 +10563,11 @@
   StmtResult ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
                                       Stmt *AStmt, SourceLocation StartLoc,
                                       SourceLocation EndLoc);
+  /// Called on well-formed '#pragma omp unroll' after parsing of its clauses
+  /// and the associated statement.
+  StmtResult ActOnOpenMPUnrollDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
   /// Called on well-formed '\#pragma omp for' after parsing
   /// of the associated statement.
   StmtResult
@@ -10898,6 +10923,13 @@
                                       SourceLocation StartLoc,
                                       SourceLocation LParenLoc,
                                       SourceLocation EndLoc);
+  /// Called on well-form 'full' clauses.
+  OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc,
+                                   SourceLocation EndLoc);
+  /// Called on well-form 'partial' clauses.
+  OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc,
+                                      SourceLocation LParenLoc,
+                                      SourceLocation EndLoc);
   /// Called on well-formed 'collapse' clause.
OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops, SourceLocation StartLoc, diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1888,6 +1888,7 @@ STMT_OMP_PARALLEL_DIRECTIVE, STMT_OMP_SIMD_DIRECTIVE, STMT_OMP_TILE_DIRECTIVE, + STMT_OMP_UNROLL_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, STMT_OMP_FOR_SIMD_DIRECTIVE, STMT_OMP_SECTIONS_DIRECTIVE, diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -942,6 +942,36 @@ return new (Mem) OMPSizesClause(NumSizes); } +OMPFullClause *OMPFullClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation EndLoc) { + OMPFullClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLocEnd(EndLoc); + return Clause; +} + +OMPFullClause *OMPFullClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPFullClause(); +} + +OMPPartialClause *OMPPartialClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + Expr *Factor) { + OMPPartialClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLParenLoc(LParenLoc); + Clause->setLocEnd(EndLoc); + Clause->setFactor(Factor); + return Clause; +} + +OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPPartialClause(); +} + OMPAllocateClause * OMPAllocateClause::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, Expr *Allocator, @@ -1602,6 +1632,18 @@ OS << ")"; } +void OMPClausePrinter::VisitOMPFullClause(OMPFullClause *Node) { OS << "full"; } + +void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) { + OS << "partial"; + + if (Expr *Factor = Node->getFactor()) { + OS << '('; + Factor->printPretty(OS, nullptr, Policy, 0); + OS 
<< ')'; + } +} + void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) { OS << "allocator("; Node->getAllocator()->printPretty(OS, nullptr, Policy, 0); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -129,9 +129,23 @@ OnTransformationCallback) { CurStmt = CurStmt->IgnoreContainers(); for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) { - while (auto *Dir = dyn_cast(CurStmt)) { - OnTransformationCallback(Dir); - CurStmt = Dir->getTransformedStmt(); + while (true) { + auto *OrigStmt = CurStmt; + if (auto *Dir = dyn_cast(OrigStmt)) { + OnTransformationCallback(Dir); + CurStmt = Dir->getTransformedStmt(); + } else if (auto *Dir = dyn_cast(OrigStmt)) { + OnTransformationCallback(Dir); + CurStmt = Dir->getTransformedStmt(); + } else + break; + + if (!CurStmt) { + // May happen if the loop transformation does not result in a generated + // loop (such as full unrolling). + CurStmt = OrigStmt; + break; + } } if (auto *CanonLoop = dyn_cast(CurStmt)) CurStmt = CanonLoop->getLoopStmt(); @@ -359,6 +373,25 @@ SourceLocation(), SourceLocation(), NumLoops); } +OMPUnrollDirective * +OMPUnrollDirective::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc, ArrayRef Clauses, + Stmt *AssociatedStmt, Stmt *TransformedStmt, + Stmt *PreInits) { + OMPUnrollDirective *Dir = createDirective( + C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc); + Dir->setTransformedStmt(TransformedStmt); + Dir->setPreInits(PreInits); + return Dir; +} + +OMPUnrollDirective *OMPUnrollDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + return createEmptyDirective( + C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1, + SourceLocation(), SourceLocation()); +} + OMPForSimdDirective * OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, unsigned CollapsedNum, diff --git 
a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -669,6 +669,11 @@ PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPUnrollDirective(OMPUnrollDirective *Node) { + Indent() << "#pragma omp unroll"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) { Indent() << "#pragma omp for"; PrintOMPExecutableDirective(Node); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -468,6 +468,13 @@ Profiler->VisitExpr(E); } +void OMPClauseProfiler::VisitOMPFullClause(const OMPFullClause *C) {} + +void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) { + if (Expr *Factor = C->getFactor()) + Profiler->VisitExpr(Factor); +} + void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { if (C->getAllocator()) Profiler->VisitStmt(C->getAllocator()); @@ -908,6 +915,10 @@ VisitOMPLoopBasedDirective(S); } +void StmtProfiler::VisitOMPUnrollDirective(const OMPUnrollDirective *S) { + VisitOMPLoopBasedDirective(S); +} + void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) { VisitOMPLoopDirective(S); } diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -452,7 +452,8 @@ DKind == OMPD_target_teams_distribute || DKind == OMPD_target_teams_distribute_parallel_for || DKind == OMPD_target_teams_distribute_parallel_for_simd || - DKind == OMPD_target_teams_distribute_simd || DKind == OMPD_tile; + DKind == OMPD_target_teams_distribute_simd || DKind == OMPD_tile || + DKind == OMPD_unroll; } bool clang::isOpenMPWorksharingDirective(OpenMPDirectiveKind DKind) { @@ -580,7 +581,7 @@ } bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) { - return DKind == OMPD_tile; + return DKind 
== OMPD_tile || DKind == OMPD_unroll;
 }
 
 void clang::getOpenMPCaptureRegions(
@@ -668,6 +669,7 @@
     CaptureRegions.push_back(OMPD_unknown);
     break;
   case OMPD_tile:
+  case OMPD_unroll:
     // loop transformations do not introduce captures.
     break;
   case OMPD_threadprivate:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6679,6 +6679,7 @@
   case OMPD_task:
   case OMPD_simd:
   case OMPD_tile:
+  case OMPD_unroll:
   case OMPD_sections:
   case OMPD_section:
   case OMPD_single:
@@ -6997,6 +6998,7 @@
   case OMPD_task:
   case OMPD_simd:
   case OMPD_tile:
+  case OMPD_unroll:
   case OMPD_sections:
   case OMPD_section:
   case OMPD_single:
@@ -9623,6 +9625,7 @@
   case OMPD_task:
   case OMPD_simd:
   case OMPD_tile:
+  case OMPD_unroll:
   case OMPD_sections:
   case OMPD_section:
   case OMPD_single:
@@ -10452,6 +10455,7 @@
   case OMPD_task:
   case OMPD_simd:
   case OMPD_tile:
+  case OMPD_unroll:
   case OMPD_sections:
   case OMPD_section:
   case OMPD_single:
@@ -11135,6 +11139,7 @@
   case OMPD_task:
   case OMPD_simd:
   case OMPD_tile:
+  case OMPD_unroll:
   case OMPD_sections:
   case OMPD_section:
   case OMPD_single:
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -208,6 +208,9 @@
   case Stmt::OMPTileDirectiveClass:
     EmitOMPTileDirective(cast<OMPTileDirective>(*S));
     break;
+  case Stmt::OMPUnrollDirectiveClass:
+    EmitOMPUnrollDirective(cast<OMPUnrollDirective>(*S));
+    break;
   case Stmt::OMPForDirectiveClass:
     EmitOMPForDirective(cast<OMPForDirective>(*S));
     break;
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -176,6 +176,8 @@
       PreInits = cast_or_null<DeclStmt>(LD->getPreInits());
     } else if (const auto *Tile = dyn_cast<OMPTileDirective>(&S)) {
       PreInits = cast_or_null<DeclStmt>(Tile->getPreInits());
+    } else if (const auto *Unroll = dyn_cast<OMPUnrollDirective>(&S)) {
+      PreInits = cast_or_null<DeclStmt>(Unroll->getPreInits());
     } else {
       llvm_unreachable("Unknown loop-based directive kind.");
     }
@@ -1819,6 +1821,8 @@
       if (SimplifiedS == NextLoop) {
         if (auto *Dir = dyn_cast<OMPTileDirective>(SimplifiedS))
           SimplifiedS = Dir->getTransformedStmt();
+        if (auto *Dir = dyn_cast<OMPUnrollDirective>(SimplifiedS))
+          SimplifiedS = Dir->getTransformedStmt();
         if (const auto *CanonLoop = dyn_cast<OMPCanonicalLoop>(SimplifiedS))
           SimplifiedS = CanonLoop->getLoopStmt();
         if (const auto *For = dyn_cast<ForStmt>(SimplifiedS)) {
@@ -2577,6 +2581,34 @@
   EmitStmt(S.getTransformedStmt());
 }
 
+void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) {
+  // This function is only called if the unrolled loop is not consumed by any
+  // other loop-associated construct. Such a loop-associated construct will have
+  // used the transformed AST.
+
+  // Set the unroll metadata for the next emitted loop.
+  LoopStack.setUnrollState(LoopAttributes::Enable);
+
+  auto *FullClause = S.getSingleClause<OMPFullClause>();
+  if (FullClause)
+    LoopStack.setUnrollState(LoopAttributes::Full);
+
+  auto *PartialClause = S.getSingleClause<OMPPartialClause>();
+  uint64_t Factor = 0;
+  if (PartialClause) {
+    if (Expr *FactorExpr = PartialClause->getFactor()) {
+      RValue FactorRVal = EmitAnyExpr(FactorExpr, AggValueSlot::ignored(),
+                                      /*ignoreResult=*/true);
+      Factor =
+          cast<llvm::ConstantInt>(FactorRVal.getScalarVal())->getZExtValue();
+      assert(Factor >= 1 && "Only positive factors are valid");
+      LoopStack.setUnrollCount(Factor);
+    }
+  }
+
+  EmitStmt(S.getAssociatedStmt());
+}
+
 void CodeGenFunction::EmitOMPOuterLoop(
     bool DynamicOrOrdered, bool IsMonotonic, const OMPLoopDirective &S,
     CodeGenFunction::OMPPrivateScope &LoopScope,
@@ -5754,6 +5786,8 @@
   case OMPC_safelen:
   case OMPC_simdlen:
   case OMPC_sizes:
+  case OMPC_full:
+  case OMPC_partial:
   case OMPC_allocator:
   case OMPC_allocate:
   case OMPC_collapse:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3434,6 +3434,7 @@
   void EmitOMPParallelDirective(const
OMPParallelDirective &S); void EmitOMPSimdDirective(const OMPSimdDirective &S); void EmitOMPTileDirective(const OMPTileDirective &S); + void EmitOMPUnrollDirective(const OMPUnrollDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPSectionsDirective(const OMPSectionsDirective &S); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2154,6 +2154,7 @@ case OMPD_parallel: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_task: case OMPD_taskyield: case OMPD_barrier: @@ -2392,6 +2393,7 @@ case OMPD_parallel: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -2781,6 +2783,7 @@ case OMPC_novariants: case OMPC_nocontext: case OMPC_filter: + case OMPC_partial: // OpenMP [2.5, Restrictions] // At most one num_threads clause can appear on the directive. // OpenMP [2.8.1, simd construct, Restrictions] @@ -2812,7 +2815,8 @@ ErrorFound = true; } - if (CKind == OMPC_ordered && PP.LookAhead(/*N=*/0).isNot(tok::l_paren)) + if ((CKind == OMPC_ordered || CKind == OMPC_partial) && + PP.LookAhead(/*N=*/0).isNot(tok::l_paren)) Clause = ParseOpenMPClause(CKind, WrongDirective); else Clause = ParseOpenMPSingleExprClause(CKind, WrongDirective); @@ -2875,6 +2879,7 @@ case OMPC_unified_shared_memory: case OMPC_reverse_offload: case OMPC_dynamic_allocators: + case OMPC_full: // OpenMP [2.7.1, Restrictions, p. 9] // Only one ordered clause can appear on a loop directive. // OpenMP [2.7.1, Restrictions, C/C++, p. 
4]
diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp
--- a/clang/lib/Sema/SemaExceptionSpec.cpp
+++ b/clang/lib/Sema/SemaExceptionSpec.cpp
@@ -1462,6 +1462,7 @@
   case Stmt::OMPSectionsDirectiveClass:
   case Stmt::OMPSimdDirectiveClass:
   case Stmt::OMPTileDirectiveClass:
+  case Stmt::OMPUnrollDirectiveClass:
   case Stmt::OMPSingleDirectiveClass:
   case Stmt::OMPTargetDataDirectiveClass:
   case Stmt::OMPTargetDirectiveClass:
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -3809,6 +3809,11 @@
     VisitStmt(S);
   }
 
+  void VisitOMPUnrollDirective(OMPUnrollDirective *S) {
+    // #pragma omp unroll does not introduce data sharing.
+    VisitStmt(S);
+  }
+
   void VisitStmt(Stmt *S) {
     for (Stmt *C : S->children()) {
       if (C) {
@@ -3975,6 +3980,7 @@
   case OMPD_master:
   case OMPD_masked:
   case OMPD_tile:
+  case OMPD_unroll:
     break;
   case OMPD_simd:
   case OMPD_for:
@@ -5845,6 +5851,10 @@
     Res = ActOnOpenMPTileDirective(ClausesWithImplicit, AStmt, StartLoc,
                                    EndLoc);
     break;
+  case OMPD_unroll:
+    Res = ActOnOpenMPUnrollDirective(ClausesWithImplicit, AStmt, StartLoc,
+                                     EndLoc);
+    break;
   case OMPD_for:
     Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc,
                                   VarsWithInheritedDSA);
@@ -8964,8 +8974,15 @@
             return false;
           },
           [&SemaRef, &Captures](OMPLoopBasedDirective *Transform) {
-            Stmt *DependentPreInits =
-                cast<OMPTileDirective>(Transform)->getPreInits();
+            Stmt *DependentPreInits;
+            if (auto *Dir = dyn_cast<OMPTileDirective>(Transform)) {
+              DependentPreInits = Dir->getPreInits();
+            } else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform)) {
+              DependentPreInits = Dir->getPreInits();
+            } else
+              llvm_unreachable("Unexpected loop transformation");
+            if (!DependentPreInits)
+              return;
             for (Decl *C : cast<DeclStmt>(DependentPreInits)->getDeclGroup()) {
               auto *D = cast<VarDecl>(C);
               DeclRefExpr *Ref = buildDeclRefExpr(SemaRef, D, D->getType(),
@@ -10117,14 +10134,14 @@
       DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion());
 }
 
-/// detach and mergeable clauses are mutially exclusive, check for it.
-static bool checkDetachMergeableClauses(Sema &S,
-                                        ArrayRef<OMPClause *> Clauses) {
+/// Find and diagnose mutually exclusive clause kinds.
+static bool checkMutuallyExclusiveClauses(
+    Sema &S, ArrayRef<OMPClause *> Clauses,
+    ArrayRef<OpenMPClauseKind> MutuallyExclusiveClauses) {
   const OMPClause *PrevClause = nullptr;
   bool ErrorFound = false;
   for (const OMPClause *C : Clauses) {
-    if (C->getClauseKind() == OMPC_detach ||
-        C->getClauseKind() == OMPC_mergeable) {
+    if (llvm::is_contained(MutuallyExclusiveClauses, C->getClauseKind())) {
       if (!PrevClause) {
         PrevClause = C;
       } else if (PrevClause->getClauseKind() != C->getClauseKind()) {
@@ -10149,7 +10166,8 @@
   // OpenMP 5.0, 2.10.1 task Construct
   // If a detach clause appears on the directive, then a mergeable clause cannot
   // appear on the same directive.
-  if (checkDetachMergeableClauses(*this, Clauses))
+  if (checkMutuallyExclusiveClauses(*this, Clauses,
+                                    {OMPC_detach, OMPC_mergeable}))
     return StmtError();
 
   auto *CS = cast<CapturedStmt>(AStmt);
@@ -11453,28 +11471,6 @@
                                      CancelRegion);
 }
 
-static bool checkGrainsizeNumTasksClauses(Sema &S,
-                                          ArrayRef<OMPClause *> Clauses) {
-  const OMPClause *PrevClause = nullptr;
-  bool ErrorFound = false;
-  for (const OMPClause *C : Clauses) {
-    if (C->getClauseKind() == OMPC_grainsize ||
-        C->getClauseKind() == OMPC_num_tasks) {
-      if (!PrevClause)
-        PrevClause = C;
-      else if (PrevClause->getClauseKind() != C->getClauseKind()) {
-        S.Diag(C->getBeginLoc(), diag::err_omp_clauses_mutually_exclusive)
-            << getOpenMPClauseName(C->getClauseKind())
-            << getOpenMPClauseName(PrevClause->getClauseKind());
-        S.Diag(PrevClause->getBeginLoc(), diag::note_omp_previous_clause)
-            << getOpenMPClauseName(PrevClause->getClauseKind());
-        ErrorFound = true;
-      }
-    }
-  }
-  return ErrorFound;
-}
-
 static bool checkReductionClauseWithNogroup(Sema &S,
                                             ArrayRef<OMPClause *> Clauses) {
   const OMPClause *ReductionClause = nullptr;
@@ -11525,7 +11521,8 @@
   // OpenMP, [2.9.2 taskloop Construct,
Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkGrainsizeNumTasksClauses(*this, Clauses)) + if (checkMutuallyExclusiveClauses(*this, Clauses, + {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup @@ -11573,7 +11570,8 @@ // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkGrainsizeNumTasksClauses(*this, Clauses)) + if (checkMutuallyExclusiveClauses(*this, Clauses, + {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup @@ -11611,7 +11609,8 @@ // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkGrainsizeNumTasksClauses(*this, Clauses)) + if (checkMutuallyExclusiveClauses(*this, Clauses, + {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup @@ -11659,7 +11658,8 @@ // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. 
-  if (checkGrainsizeNumTasksClauses(*this, Clauses))
+  if (checkMutuallyExclusiveClauses(*this, Clauses,
+                                    {OMPC_grainsize, OMPC_num_tasks}))
    return StmtError();
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // If a reduction clause is present on the taskloop directive, the nogroup
@@ -11716,7 +11716,8 @@
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // The grainsize clause and num_tasks clause are mutually exclusive and may
   // not appear on the same taskloop directive.
-  if (checkGrainsizeNumTasksClauses(*this, Clauses))
+  if (checkMutuallyExclusiveClauses(*this, Clauses,
+                                    {OMPC_grainsize, OMPC_num_tasks}))
     return StmtError();
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // If a reduction clause is present on the taskloop directive, the nogroup
@@ -11783,7 +11784,8 @@
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // The grainsize clause and num_tasks clause are mutually exclusive and may
   // not appear on the same taskloop directive.
-  if (checkGrainsizeNumTasksClauses(*this, Clauses))
+  if (checkMutuallyExclusiveClauses(*this, Clauses,
+                                    {OMPC_grainsize, OMPC_num_tasks}))
     return StmtError();
   // OpenMP, [2.9.2 taskloop Construct, Restrictions]
   // If a reduction clause is present on the taskloop directive, the nogroup
@@ -12553,6 +12555,55 @@
       Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B);
 }
 
+bool Sema::checkTransformableLoopNest(
+    OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
+    SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers,
+    Stmt *&Body,
+    SmallVectorImpl<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>>
+        &OriginalInits) {
+  OriginalInits.emplace_back();
+  bool Result = OMPLoopBasedDirective::doForAllLoops(
+      AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops,
+      [this, &LoopHelpers, &Body, &OriginalInits, Kind](unsigned Cnt,
+                                                        Stmt *CurStmt) {
+        VarsWithInheritedDSAType TmpDSA;
+        unsigned SingleNumLoops =
+            checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, *this, *DSAStack,
+                            TmpDSA, LoopHelpers[Cnt]);
+        if (SingleNumLoops == 0)
+          return true;
+        assert(SingleNumLoops == 1 && "Expect single loop iteration space");
+        if (auto *For = dyn_cast<ForStmt>(CurStmt)) {
+          OriginalInits.back().push_back(For->getInit());
+          Body = For->getBody();
+        } else {
+          assert(isa<CXXForRangeStmt>(CurStmt) &&
+                 "Expected canonical for or range-based for loops.");
+          auto *CXXFor = cast<CXXForRangeStmt>(CurStmt);
+          OriginalInits.back().push_back(CXXFor->getBeginStmt());
+          Body = CXXFor->getBody();
+        }
+        OriginalInits.emplace_back();
+        return false;
+      },
+      [&OriginalInits](OMPLoopBasedDirective *Transform) {
+        Stmt *DependentPreInits;
+        if (auto *Dir = dyn_cast<OMPTileDirective>(Transform))
+          DependentPreInits = Dir->getPreInits();
+        else if (auto *Dir = dyn_cast<OMPUnrollDirective>(Transform))
+          DependentPreInits = Dir->getPreInits();
+        else
+          llvm_unreachable("Unhandled loop transformation");
+        if (!DependentPreInits)
+          return;
+        for (Decl *C : cast<DeclStmt>(DependentPreInits)->getDeclGroup())
+          OriginalInits.back().push_back(C);
+      });
+  assert(OriginalInits.back().empty());
+  OriginalInits.pop_back();
+  return Result;
+}
+
 StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses,
                                           Stmt *AStmt, SourceLocation StartLoc,
                                           SourceLocation EndLoc) {
@@ -12571,39 +12622,10 @@
   // Verify and diagnose loop nest.
SmallVector LoopHelpers(NumLoops); - SmallVector PreInits; Stmt *Body = nullptr; - SmallVector OriginalInits; - if (!OMPLoopBasedDirective::doForAllLoops( - AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, - NumLoops, - [this, &LoopHelpers, &Body, &OriginalInits](unsigned Cnt, - Stmt *CurStmt) { - VarsWithInheritedDSAType TmpDSA; - unsigned SingleNumLoops = - checkOpenMPLoop(OMPD_tile, nullptr, nullptr, CurStmt, *this, - *DSAStack, TmpDSA, LoopHelpers[Cnt]); - if (SingleNumLoops == 0) - return true; - assert(SingleNumLoops == 1 && "Expect single loop iteration space"); - if (auto *For = dyn_cast(CurStmt)) { - OriginalInits.push_back(For->getInit()); - Body = For->getBody(); - } else { - assert(isa(CurStmt) && - "Expected canonical for or range-based for loops."); - auto *CXXFor = cast(CurStmt); - OriginalInits.push_back(CXXFor->getBeginStmt()); - Body = CXXFor->getBody(); - } - return false; - }, - [this, &PreInits](OMPLoopBasedDirective *Transform) { - Stmt *DependentPreInits = - cast(Transform)->getPreInits(); - for (Decl *C : cast(DependentPreInits)->getDeclGroup()) - PreInits.push_back(C); - })) + SmallVector>, 4> OriginalInits; + if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body, + OriginalInits)) return StmtError(); // Delay tiling to when template is completely instantiated. @@ -12611,6 +12633,7 @@ return OMPTileDirective::Create(Context, StartLoc, EndLoc, Clauses, NumLoops, AStmt, nullptr, nullptr); + SmallVector PreInits; // Create iteration variables for the generated loops. 
SmallVector FloorIndVars; @@ -12619,8 +12642,7 @@ TileIndVars.resize(NumLoops); for (unsigned I = 0; I < NumLoops; ++I) { OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers[I]; - if (auto *PI = cast_or_null(LoopHelper.PreInits)) - PreInits.append(PI->decl_begin(), PI->decl_end()); + assert(LoopHelper.Counters.size() == 1 && "Expect single-dimensional loop iteration space"); auto *OrigCntVar = cast(LoopHelper.Counters.front()); @@ -12649,7 +12671,13 @@ TileCntDecl->setDeclName(&PP.getIdentifierTable().get(TileCntName)); TileIndVars[I] = TileCntDecl; } - if (auto *PI = dyn_cast_or_null(OriginalInits[I])) + for (auto &P : OriginalInits[I]) { + if (auto *D = P.dyn_cast()) + PreInits.push_back(D); + else if (auto *PI = dyn_cast_or_null(P.dyn_cast())) + PreInits.append(PI->decl_begin(), PI->decl_end()); + } + if (auto *PI = cast_or_null(LoopHelper.PreInits)) PreInits.append(PI->decl_begin(), PI->decl_end()); // Gather declarations for the data members used as counters. for (Expr *CounterRef : LoopHelper.Counters) { @@ -12786,6 +12814,294 @@ buildPreInits(Context, PreInits)); } +/// Determine whether an expression is constant without emitting diagnostic. +static bool isConstantExpression(Sema &SemaRef, Expr *E) { + struct ConstTripcountDiagnoser : public Sema::VerifyICEDiagnoser { + ConstTripcountDiagnoser() : VerifyICEDiagnoser(/*Suppress=*/true) {} + Sema::SemaDiagnosticBuilder diagnoseNotICE(Sema &S, + SourceLocation Loc) override { + llvm_unreachable("Diagnostic suppressed"); + } + } Diagnoser; + ExprResult ConstResult = SemaRef.VerifyIntegerConstantExpression( + E, nullptr, Diagnoser, Sema::AllowFold); + return ConstResult.isUsable(); +} + +StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + // Empty statement should only be possible if there already was an error. 
+  if (!AStmt)
+    return StmtError();
+
+  if (checkMutuallyExclusiveClauses(*this, Clauses, {OMPC_partial, OMPC_full}))
+    return StmtError();
+
+  const OMPFullClause *FullClause =
+      OMPExecutableDirective::getSingleClause<OMPFullClause>(Clauses);
+  const OMPPartialClause *PartialClause =
+      OMPExecutableDirective::getSingleClause<OMPPartialClause>(Clauses);
+  assert(!(FullClause && PartialClause) &&
+         "mutual exclusivity must have been checked before");
+
+  constexpr unsigned NumLoops = 1;
+  Stmt *Body = nullptr;
+  SmallVector<OMPLoopBasedDirective::HelperExprs, NumLoops> LoopHelpers(
+      NumLoops);
+  SmallVector<SmallVector<llvm::PointerUnion<Stmt *, Decl *>, 0>, NumLoops + 1>
+      OriginalInits;
+  if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers,
+                                  Body, OriginalInits))
+    return StmtError();
+
+  // Delay unrolling to when template is completely instantiated.
+  if (CurContext->isDependentContext())
+    return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
+                                      nullptr, nullptr);
+
+  OMPLoopBasedDirective::HelperExprs &LoopHelper = LoopHelpers.front();
+
+  if (FullClause) {
+    if (!isConstantExpression(*this, LoopHelper.NumIterations)) {
+      Diag(AStmt->getBeginLoc(), diag::err_omp_unroll_full_variable_trip_count);
+      Diag(FullClause->getBeginLoc(), diag::note_omp_directive_here)
+          << "#pragma omp unroll full";
+      return StmtError();
+    }
+  }
+
+  // The generated loop may only be passed to other loop-associated directive
+  // when a partial clause is specified. Without the requirement it is
+  // sufficient to generate loop unroll metadata at code-generation.
+  if (!PartialClause)
+    return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
+                                      nullptr, nullptr);
+
+  // Otherwise, we need to provide a de-sugared/transformed AST that can be
+  // associated with another loop directive.
+  //
+  // The canonical loop analysis return by checkTransformableLoopNest assumes
+  // the following structure to be the same loop without transformations or
+  // directives applied:
+  // \code
+  // OriginalInits;
+  // LoopHelper.PreInits;
+  // LoopHelper.Counters;
+  // for (; IV < LoopHelper.NumIterations; ++IV) {
+  //   LoopHelper.Updates;
+  //   Body;
+  // }
+  // \endcode
+  // where IV is a variable declared and initialized to 0 in LoopHelper.PreInits
+  // and referenced by LoopHelper.IterationVarRef.
+  //
+  // The unrolling directive transforms this into the following loop:
+  // \code
+  // OriginalInits;         \
+  // LoopHelper.PreInits;    > NewPreInits
+  // LoopHelper.Counters;   /
+  // for (auto UIV = 0; UIV < LoopHelper.NumIterations; UIV+=Factor) {
+  //   #pragma clang loop unroll_count(Factor)
+  //   for (IV = UIV; IV < UIV + Factor && IV < LoopHelper.NumIterations; ++IV)
+  //   {
+  //     LoopHelper.Updates;
+  //     Body;
+  //   }
+  // }
+  // \endcode
+  // where UIV is a new logical iteration counter. IV must be the same VarDecl
+  // as the original LoopHelper.IterationVarRef because LoopHelper.Updates
+  // references it. If the partially unrolled loop is associated with another
+  // loop directive (like an OMPForDirective), it will use checkOpenMPLoop to
+  // analyze this loop, i.e. the outer loop must fulfill the constraints of an
+  // OpenMP canonical loop. The inner loop is not an associable canonical loop
+  // and only exists to defer its unrolling to LLVM's LoopUnroll instead of
+  // doing it in the frontend (by adding loop metadata). NewPreInits becomes a
+  // property of the OMPLoopBasedDirective instead of statements in
+  // CompoundStatement. This is to allow the loop to become a non-outermost loop
+  // of a canonical loop nest where these PreInits are emitted before the
+  // outermost directive.
+
+  // Determine the PreInit declarations.
+  SmallVector<Decl *, 4> PreInits;
+  assert(OriginalInits.size() == 1 &&
+         "Expecting a single-dimensional loop iteration space");
+  for (auto &P : OriginalInits[0]) {
+    if (auto *D = P.dyn_cast<Decl *>())
+      PreInits.push_back(D);
+    else if (auto *PI = dyn_cast_or_null<DeclStmt>(P.dyn_cast<Stmt *>()))
+      PreInits.append(PI->decl_begin(), PI->decl_end());
+  }
+  if (auto *PI = cast_or_null<DeclStmt>(LoopHelper.PreInits))
+    PreInits.append(PI->decl_begin(), PI->decl_end());
+  // Gather declarations for the data members used as counters.
+  for (Expr *CounterRef : LoopHelper.Counters) {
+    auto *CounterDecl = cast<DeclRefExpr>(CounterRef)->getDecl();
+    if (isa<OMPCapturedExprDecl>(CounterDecl))
+      PreInits.push_back(CounterDecl);
+  }
+
+  auto *IterationVarRef = cast<DeclRefExpr>(LoopHelper.IterationVarRef);
+  QualType IVTy = IterationVarRef->getType();
+  assert(LoopHelper.Counters.size() == 1 &&
+         "Expecting a single-dimensional loop iteration space");
+  auto *OrigVar = cast<DeclRefExpr>(LoopHelper.Counters.front());
+
+  // Determine the unroll factor.
+  uint64_t Factor;
+  SourceLocation FactorLoc;
+  if (auto *FactorConst =
+          cast_or_null<ConstantExpr>(PartialClause->getFactor())) {
+    Factor = FactorConst->getResultAsAPSInt().getZExtValue();
+    FactorLoc = PartialClause->getFactor()->getExprLoc();
+  } else {
+    // TODO: Use a better profitability model.
+    Factor = 2;
+  }
+  assert(Factor > 0 && "Expected positive unroll factor");
+  auto makeFactorExpr = [this, Factor, IVTy, FactorLoc]() {
+    return IntegerLiteral::Create(
+        Context, llvm::APInt(Context.getIntWidth(IVTy), Factor), IVTy,
+        FactorLoc);
+  };
+
+  // Iteration variable SourceLocations.
+  SourceLocation OrigVarLoc = OrigVar->getExprLoc();
+  SourceLocation OrigVarLocBegin = OrigVar->getBeginLoc();
+  SourceLocation OrigVarLocEnd = OrigVar->getEndLoc();
+
+  // Internal variable names.
+  std::string OrigVarName = OrigVar->getNameInfo().getAsString();
+  std::string OuterIVName = (Twine(".unrolled.iv.") + OrigVarName).str();
+  std::string InnerIVName = (Twine(".unroll_inner.iv.") + OrigVarName).str();
+  std::string InnerTripCountName =
+      (Twine(".unroll_inner.tripcount.") + OrigVarName).str();
+
+  // Create the iteration variable for the unrolled loop.
+  VarDecl *OuterIVDecl =
+      buildVarDecl(*this, {}, IVTy, OuterIVName, nullptr, OrigVar);
+  auto makeOuterRef = [this, OuterIVDecl, IVTy, OrigVarLoc]() {
+    return buildDeclRefExpr(*this, OuterIVDecl, IVTy, OrigVarLoc);
+  };
+
+  // Iteration variable for the inner loop: Reuse the iteration variable created
+  // by checkOpenMPLoop.
+  auto *InnerIVDecl = cast<VarDecl>(IterationVarRef->getDecl());
+  InnerIVDecl->setDeclName(&PP.getIdentifierTable().get(InnerIVName));
+  auto makeInnerRef = [this, InnerIVDecl, IVTy, OrigVarLoc]() {
+    return buildDeclRefExpr(*this, InnerIVDecl, IVTy, OrigVarLoc);
+  };
+
+  // Make a copy of the NumIterations expression for each use: By the AST
+  // constraints, every expression object in a DeclContext must be unique.
+  CaptureVars CopyTransformer(*this);
+  auto makeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * {
+    return AssertSuccess(
+        CopyTransformer.TransformExpr(LoopHelper.NumIterations));
+  };
+
+  // Inner For init-statement: auto .unroll_inner.iv = .unrolled.iv
+  ExprResult LValueConv = DefaultLvalueConversion(makeOuterRef());
+  AddInitializerToDecl(InnerIVDecl, LValueConv.get(), /*DirectInit=*/false);
+  StmtResult InnerInit = new (Context)
+      DeclStmt(DeclGroupRef(InnerIVDecl), OrigVarLocBegin, OrigVarLocEnd);
+  if (!InnerInit.isUsable())
+    return StmtError();
+
+  // Inner For cond-expression:
+  // \code
+  //   .unroll_inner.iv < .unrolled.iv + Factor &&
+  //   .unroll_inner.iv < NumIterations
+  // \endcode
+  // This conjunction of two conditions allows ScalarEvolution to derive the
+  // maximum trip count of the inner loop.
+  ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
+                                    BO_Add, makeOuterRef(), makeFactorExpr());
+  if (!EndOfTile.isUsable())
+    return StmtError();
+  ExprResult InnerCond1 = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(),
+                                     BO_LT, makeInnerRef(), EndOfTile.get());
+  if (!InnerCond1.isUsable())
+    return StmtError();
+  ExprResult InnerCond2 =
+      BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, makeInnerRef(),
+                 makeNumIterations());
+  if (!InnerCond2.isUsable())
+    return StmtError();
+  ExprResult InnerCond =
+      BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd,
+                 InnerCond1.get(), InnerCond2.get());
+  if (!InnerCond.isUsable())
+    return StmtError();
+
+  // Inner For incr-statement: ++.unroll_inner.iv
+  ExprResult InnerIncr = BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(),
+                                      UO_PreInc, makeInnerRef());
+  if (!InnerIncr.isUsable())
+    return StmtError();
+
+  // Inner For statement.
+  SmallVector<Stmt *> InnerBodyStmts;
+  InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end());
+  InnerBodyStmts.push_back(Body);
+  CompoundStmt *InnerBody = CompoundStmt::Create(
+      Context, InnerBodyStmts, Body->getBeginLoc(), Body->getEndLoc());
+  ForStmt *InnerFor = new (Context)
+      ForStmt(Context, InnerInit.get(), InnerCond.get(), nullptr,
+              InnerIncr.get(), InnerBody, LoopHelper.Init->getBeginLoc(),
+              LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc());
+
+  // Unroll metadata for the inner loop.
+  // This needs to take into account the remainder portion of the unrolled loop,
+  // hence `unroll(full)` does not apply here, even though the LoopUnroll pass
+  // supports multiple loop exits. Instead, unroll using a factor equivalent to
+  // the maximum trip count, which will also generate a remainder loop.
Just + // `unroll(enable)` (which could have been useful if the user has not + // specified a concrete factor; even though the outer loop cannot be + // influenced anymore, would avoid more code bloat than necessary) will refuse + // the loop because "Won't unroll; remainder loop could not be generated when + // assuming runtime trip count". Even if it did work, it must not choose a + // larger unroll factor than the maximum loop length, or it would always just + // execute the remainder loop. + LoopHintAttr *UnrollHintAttr = + LoopHintAttr::CreateImplicit(Context, LoopHintAttr::UnrollCount, + LoopHintAttr::Numeric, makeFactorExpr()); + AttributedStmt *InnerUnrolled = + AttributedStmt::Create(Context, StartLoc, {UnrollHintAttr}, InnerFor); + + // Outer For init-statement: auto .unrolled.iv = 0 + AddInitializerToDecl( + OuterIVDecl, ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), + /*DirectInit=*/false); + StmtResult OuterInit = new (Context) + DeclStmt(DeclGroupRef(OuterIVDecl), OrigVarLocBegin, OrigVarLocEnd); + if (!OuterInit.isUsable()) + return StmtError(); + + // Outer For cond-expression: .unrolled.iv < NumIterations + ExprResult OuterConde = + BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, makeOuterRef(), + makeNumIterations()); + if (!OuterConde.isUsable()) + return StmtError(); + + // Outer For incr-statement: .unrolled.iv += Factor + ExprResult OuterIncr = + BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign, + makeOuterRef(), makeFactorExpr()); + if (!OuterIncr.isUsable()) + return StmtError(); + + // Outer For statement. 
+ ForStmt *OuterFor = new (Context) + ForStmt(Context, OuterInit.get(), OuterConde.get(), nullptr, + OuterIncr.get(), InnerUnrolled, LoopHelper.Init->getBeginLoc(), + LoopHelper.Init->getBeginLoc(), LoopHelper.Inc->getEndLoc()); + + return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, + OuterFor, buildPreInits(Context, PreInits)); +} + OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, SourceLocation StartLoc, SourceLocation LParenLoc, @@ -12846,6 +13162,9 @@ case OMPC_filter: Res = ActOnOpenMPFilterClause(Expr, StartLoc, LParenLoc, EndLoc); break; + case OMPC_partial: + Res = ActOnOpenMPPartialClause(Expr, StartLoc, LParenLoc, EndLoc); + break; case OMPC_device: case OMPC_if: case OMPC_default: @@ -13048,6 +13367,7 @@ case OMPD_end_declare_target: case OMPD_teams: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_sections: case OMPD_section: @@ -13126,6 +13446,7 @@ case OMPD_teams: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13207,6 +13528,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13286,6 +13608,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13366,6 +13689,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_sections: case OMPD_section: case OMPD_single: @@ -13445,6 +13769,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13525,6 +13850,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -13606,6 +13932,7 @@ case OMPD_end_declare_target: case OMPD_simd: case OMPD_tile: + case OMPD_unroll: case OMPD_for: case 
OMPD_for_simd: case OMPD_sections: @@ -14304,6 +14631,29 @@ SizeExprs); } +OMPClause *Sema::ActOnOpenMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPFullClause::Create(Context, StartLoc, EndLoc); +} + +OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + if (FactorExpr) { + // If an argument is specified, it must be a constant (or an unevaluated + // template expression). + ExprResult FactorResult = VerifyPositiveIntegerConstantInClause( + FactorExpr, OMPC_partial, /*StrictlyPositive=*/true); + if (FactorResult.isInvalid()) + return nullptr; + FactorExpr = FactorResult.get(); + } + + return OMPPartialClause::Create(Context, StartLoc, LParenLoc, EndLoc, + FactorExpr); +} + OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( OpenMPClauseKind Kind, ArrayRef Argument, Expr *Expr, SourceLocation StartLoc, SourceLocation LParenLoc, @@ -14606,6 +14956,12 @@ /*LParenLoc=*/SourceLocation(), /*VarLoc=*/SourceLocation(), EndLoc); break; + case OMPC_full: + Res = ActOnOpenMPFullClause(StartLoc, EndLoc); + break; + case OMPC_partial: + Res = ActOnOpenMPPartialClause(nullptr, StartLoc, /*LParenLoc=*/{}, EndLoc); + break; case OMPC_if: case OMPC_final: case OMPC_num_threads: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1633,6 +1633,20 @@ return getSema().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, EndLoc); } + /// Build a new OpenMP 'full' clause. + OMPClause *RebuildOMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPFullClause(StartLoc, EndLoc); + } + + /// Build a new OpenMP 'partial' clause. 
+ OMPClause *RebuildOMPPartialClause(Expr *Factor, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPPartialClause(Factor, StartLoc, LParenLoc, + EndLoc); + } + /// Build a new OpenMP 'allocator' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. @@ -8519,6 +8533,17 @@ return Res; } +template +StmtResult +TreeTransform::TransformOMPUnrollDirective(OMPUnrollDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, + nullptr, D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { @@ -9218,6 +9243,28 @@ C->getLParenLoc(), C->getEndLoc()); } +template +OMPClause *TreeTransform::TransformOMPFullClause(OMPFullClause *C) { + if (!getDerived().AlwaysRebuild()) + return C; + return RebuildOMPFullClause(C->getBeginLoc(), C->getEndLoc()); +} + +template +OMPClause * +TreeTransform::TransformOMPPartialClause(OMPPartialClause *C) { + ExprResult T = getDerived().TransformExpr(C->getFactor()); + if (T.isInvalid()) + return nullptr; + Expr *Factor = T.get(); + bool Changed = Factor != C->getFactor(); + + if (!Changed && !getDerived().AlwaysRebuild()) + return C; + return RebuildOMPPartialClause(Factor, C->getBeginLoc(), C->getLParenLoc(), + C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPCollapseClause(OMPCollapseClause *C) { diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11748,6 +11748,12 @@ C = OMPSizesClause::CreateEmpty(Context, NumSizes); break; } + case llvm::omp::OMPC_full: + C = OMPFullClause::CreateEmpty(Context); + break; + case llvm::omp::OMPC_partial: + C = 
OMPPartialClause::CreateEmpty(Context); + break; case llvm::omp::OMPC_allocator: C = new (Context) OMPAllocatorClause(); break; @@ -12060,6 +12066,13 @@ C->setLParenLoc(Record.readSourceLocation()); } +void OMPClauseReader::VisitOMPFullClause(OMPFullClause *C) {} + +void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) { + C->setFactor(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); +} + void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) { C->setAllocator(Record.readExpr()); C->setLParenLoc(Record.readSourceLocation()); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2309,6 +2309,10 @@ VisitOMPLoopBasedDirective(D); } +void ASTStmtReader::VisitOMPUnrollDirective(OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); +} + void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); D->setHasCancel(Record.readBool()); @@ -3185,6 +3189,14 @@ break; } + case STMT_OMP_UNROLL_DIRECTIVE: { + unsigned NumLoops = Record[ASTStmtReader::NumStmtFields]; + assert(NumLoops == 1 && "Unroll directive accepts only a single loop"); + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; + S = OMPUnrollDirective::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OMP_FOR_DIRECTIVE: { unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields]; unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6128,6 +6128,13 @@ Record.AddSourceLocation(C->getLParenLoc()); } +void OMPClauseWriter::VisitOMPFullClause(OMPFullClause *C) {} + +void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) { + Record.AddStmt(C->getFactor()); + 
Record.AddSourceLocation(C->getLParenLoc()); +} + void OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) { Record.AddStmt(C->getAllocator()); Record.AddSourceLocation(C->getLParenLoc()); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2210,6 +2210,11 @@ Code = serialization::STMT_OMP_TILE_DIRECTIVE; } +void ASTStmtWriter::VisitOMPUnrollDirective(OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); + Code = serialization::STMT_OMP_UNROLL_DIRECTIVE; +} + void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); Record.writeBool(D->hasCancel()); diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1297,7 +1297,8 @@ case Stmt::OMPInteropDirectiveClass: case Stmt::OMPDispatchDirectiveClass: case Stmt::OMPMaskedDirectiveClass: - case Stmt::CapturedStmtClass: { + case Stmt::CapturedStmtClass: + case Stmt::OMPUnrollDirectiveClass: { const ExplodedNode *node = Bldr.generateSink(S, Pred, Pred->getState()); Engine.addAbortedBlock(node, currBldrCtx->getBlock()); break; diff --git a/clang/test/OpenMP/tile_codegen.cpp b/clang/test/OpenMP/tile_codegen.cpp --- a/clang/test/OpenMP/tile_codegen.cpp +++ b/clang/test/OpenMP/tile_codegen.cpp @@ -942,14 +942,16 @@ // IR-NEXT: entry: // IR-NEXT: [[START_ADDR:%.*]] = alloca i32, align 4 // IR-NEXT: [[END_ADDR:%.*]] = alloca i32, align 4 +// IR-NEXT: [[I:%.*]] = alloca i32, align 4 // IR-NEXT: [[CAPTURE_EXPR:%.+]] = alloca i32, align 4 // IR-NEXT: [[CAPTURE_EXPR1:%.+]] = alloca i32, align 4 // IR-NEXT: [[CAPTURE_EXPR2:%.+]] = alloca i32, align 4 -// IR-NEXT: [[I:%.*]] = alloca i32, align 4 // IR-NEXT: [[DOTFLOOR_0_IV_I:%.*]] = alloca i32, align 4 // IR-NEXT: [[DOTTILE_0_IV_I:%.*]] = 
alloca i32, align 4 // IR-NEXT: store i32 [[START:%.*]], i32* [[START_ADDR]], align 4 // IR-NEXT: store i32 [[END:%.*]], i32* [[END_ADDR]], align 4 +// IR-NEXT: [[TMP4:%.+]] = load i32, i32* [[START_ADDR]], align 4 +// IR-NEXT: store i32 [[TMP4]], i32* [[I]], align 4 // IR-NEXT: [[TMP0:%.+]] = load i32, i32* [[START_ADDR]], align 4 // IR-NEXT: store i32 [[TMP0]], i32* [[CAPTURE_EXPR]], align 4 // IR-NEXT: [[TMP1:%.+]] = load i32, i32* [[END_ADDR]], align 4 @@ -962,8 +964,6 @@ // IR-NEXT: [[DIV:%.+]] = udiv i32 [[ADD]], 3 // IR-NEXT: [[SUB4:%.+]] = sub i32 [[DIV]], 1 // IR-NEXT: store i32 [[SUB4]], i32* [[CAPTURE_EXPR2]], align 4 -// IR-NEXT: [[TMP4:%.+]] = load i32, i32* [[START_ADDR]], align 4 -// IR-NEXT: store i32 [[TMP4]], i32* [[I]], align 4 // IR-NEXT: store i32 0, i32* [[DOTFLOOR_0_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND:.*]] // IR: [[FOR_COND]]: diff --git a/clang/test/OpenMP/tile_codegen_for_dependent.cpp b/clang/test/OpenMP/tile_codegen_for_dependent.cpp --- a/clang/test/OpenMP/tile_codegen_for_dependent.cpp +++ b/clang/test/OpenMP/tile_codegen_for_dependent.cpp @@ -14,11 +14,9 @@ #define HEADER // placeholder for loop body code. - extern "C" void body(...) 
{} - // IR-LABEL: @func( // IR-NEXT: [[ENTRY:.*]]: // IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 @@ -26,11 +24,11 @@ // IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 -// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 @@ -45,23 +43,23 @@ // IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 // IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 // IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP4]], %[[TMP5]] +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* 
%[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] // IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 -// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP6]] // IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] // IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 // IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP8]], i32* %[[I]], align 4 // IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 // IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 // IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 diff --git a/clang/test/OpenMP/tile_codegen_for_multiple.cpp b/clang/test/OpenMP/tile_codegen_tile_for.cpp rename from clang/test/OpenMP/tile_codegen_for_multiple.cpp rename to clang/test/OpenMP/tile_codegen_tile_for.cpp --- a/clang/test/OpenMP/tile_codegen_for_multiple.cpp +++ b/clang/test/OpenMP/tile_codegen_tile_for.cpp @@ -23,14 +23,14 @@ // IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 -// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// 
IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 @@ -46,23 +46,24 @@ // IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 // IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 // IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP4]], %[[TMP5]] +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] // IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 -// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: 
%[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP6]] // IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] // IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 // IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP8]], i32* %[[I]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 // IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 // IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 // IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 @@ -71,7 +72,6 @@ // IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 // IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 // IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 // IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 // IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 // IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 @@ -246,3 +246,8 @@ } #endif /* HEADER */ +// IR: ![[META0:[0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// IR: ![[META1:[0-9]+]] = !{!"clang version {{[^"]*}}"} +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOP4]] = distinct !{![[LOOP4]], ![[LOOPPROP3]]} diff --git a/clang/test/OpenMP/unroll_ast_print.cpp b/clang/test/OpenMP/unroll_ast_print.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_ast_print.cpp @@ -0,0 +1,127 @@ +// Check no warnings/errors +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -fsyntax-only -verify %s +// expected-no-diagnostics + +// 
Check AST and unparsing +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -ast-dump %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -ast-print %s | FileCheck %s --check-prefix=PRINT --match-full-lines + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -ast-dump-all %s | FileCheck %s --check-prefix=DUMP +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -ast-print %s | FileCheck %s --check-prefix=PRINT --match-full-lines + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +void body(...); + + +// PRINT-LABEL: void func_unroll() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll +void func_unroll() { + // PRINT: #pragma omp unroll + // DUMP: OMPUnrollDirective + #pragma omp unroll + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_full() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_full +void func_unroll_full() { + // PRINT: #pragma omp unroll full + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPFullClause + #pragma omp unroll full + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial +void func_unroll_partial() { + // PRINT: #pragma omp unroll partial + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + // DUMP-NEXT: <<>> + #pragma omp unroll partial + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; 
i < 17; i += 3) + // PRINT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial_factor() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial_factor +void func_unroll_partial_factor() { + // PRINT: #pragma omp unroll partial(4) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + // DUMP-NEXT: ConstantExpr + // DUMP-NEXT: value: Int 4 + // DUMP-NEXT: IntegerLiteral {{.*}} 4 + #pragma omp unroll partial(4) + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP-NEXT: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: void func_unroll_partial_factor_for() { +// DUMP-LABEL: FunctionDecl {{.*}} func_unroll_partial_factor_for +void func_unroll_partial_factor_for() { + // PRINT: #pragma omp for + // DUMP: OMPForDirective + #pragma omp for + // PRINT: #pragma omp unroll partial(2) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + #pragma omp unroll partial(2) + // PRINT-NEXT: for (int i = 7; i < 17; i += 3) + // DUMP: ForStmt + for (int i = 7; i < 17; i += 3) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} + + +// PRINT-LABEL: template void unroll_templated() { +// DUMP-LABEL: FunctionTemplateDecl {{.*}} unroll_templated +template +void unroll_templated() { + // PRINT: #pragma omp unroll partial(Factor) + // DUMP: OMPUnrollDirective + // DUMP-NEXT: OMPPartialClause + // DUMP-NEXT: DeclRefExpr {{.*}} 'Factor' 'int' + #pragma omp unroll partial(Factor) + // PRINT-NEXT: for (T i = Start; i < End; i += Step) + // DUMP-NEXT: ForStmt + for (T i = Start; i < End; i += Step) + // PRINT-NEXT: body(i); + // DUMP: CallExpr + body(i); +} +void unroll_template() { + unroll_templated(); +} + +#endif diff --git a/clang/test/OpenMP/unroll_codegen_factor.cpp b/clang/test/OpenMP/unroll_codegen_factor.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_factor.cpp @@ -0,0 +1,65 @@ +// Check code generation +// 
RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll partial(4) + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]], ![[LOOPPROP5:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 4} +// IR: ![[LOOPPROP5]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp b/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_for_collapse_outer.cpp @@ -0,0 +1,251 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i64, align 8 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_4:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_5:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_7:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_10:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i64, align 8 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i64, align 8 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I22:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_J23:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_J:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP1_1:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP1_1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* 
%[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[J]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP5]], i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP6]], i32* %[[DOTCAPTURE_EXPR_5]], align 4 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_5]], align 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP8]], %[[TMP9]] +// IR-NEXT: %[[SUB8:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB8]], %[[TMP10]] +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP11]] +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB9]], i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[ADD11:.+]] = add i32 %[[TMP12]], 1 +// IR-NEXT: store i32 %[[ADD11]], i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB13:.+]] = sub i32 %[[TMP13]], %[[TMP14]] +// IR-NEXT: %[[SUB14:.+]] = sub i32 %[[SUB13]], 1 +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD15:.+]] = add i32 %[[SUB14]], %[[TMP15]] +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* 
%[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[ADD15]], %[[TMP16]] +// IR-NEXT: %[[CONV:.+]] = zext i32 %[[DIV16]] to i64 +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB17:.+]] = sub i32 %[[TMP17]], -1 +// IR-NEXT: %[[DIV18:.+]] = udiv i32 %[[SUB17]], 2 +// IR-NEXT: %[[CONV19:.+]] = zext i32 %[[DIV18]] to i64 +// IR-NEXT: %[[MUL:.+]] = mul nsw i64 %[[CONV]], %[[CONV19]] +// IR-NEXT: %[[SUB20:.+]] = sub nsw i64 %[[MUL]], 1 +// IR-NEXT: store i64 %[[SUB20]], i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: store i32 %[[TMP18]], i32* %[[I]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_J]], align 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP19]], %[[TMP20]] +// IR-NEXT: br i1 %[[CMP]], label %[[LAND_LHS_TRUE:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_LHS_TRUE]]: +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[CMP21:.+]] = icmp ult i32 0, %[[TMP21]] +// IR-NEXT: br i1 %[[CMP21]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i64 0, i64* %[[DOTOMP_LB]], align 8 +// IR-NEXT: %[[TMP22:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: store i64 %[[TMP22]], i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: store i64 1, i64* %[[DOTOMP_STRIDE]], align 8 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_8(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i64* %[[DOTOMP_LB]], i64* %[[DOTOMP_UB]], i64* %[[DOTOMP_STRIDE]], i64 1, i64 1) +// IR-NEXT: %[[TMP23:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[TMP24:.+]] = 
load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: %[[CMP24:.+]] = icmp sgt i64 %[[TMP23]], %[[TMP24]] +// IR-NEXT: br i1 %[[CMP24]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP25:.+]] = load i64, i64* %[[DOTCAPTURE_EXPR_12]], align 8 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP26:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i64 [ %[[TMP25]], %[[COND_TRUE]] ], [ %[[TMP26]], %[[COND_FALSE]] ] +// IR-NEXT: store i64 %[[COND]], i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[TMP27:.+]] = load i64, i64* %[[DOTOMP_LB]], align 8 +// IR-NEXT: store i64 %[[TMP27]], i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP28:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP29:.+]] = load i64, i64* %[[DOTOMP_UB]], align 8 +// IR-NEXT: %[[CMP25:.+]] = icmp sle i64 %[[TMP28]], %[[TMP29]] +// IR-NEXT: br i1 %[[CMP25]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[CONV26:.+]] = sext i32 %[[TMP30]] to i64 +// IR-NEXT: %[[TMP31:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB27:.+]] = sub i32 %[[TMP32]], -1 +// IR-NEXT: %[[DIV28:.+]] = udiv i32 %[[SUB27]], 2 +// IR-NEXT: %[[MUL29:.+]] = mul i32 1, %[[DIV28]] +// IR-NEXT: %[[CONV30:.+]] = zext i32 %[[MUL29]] to i64 +// IR-NEXT: %[[DIV31:.+]] = sdiv i64 %[[TMP31]], %[[CONV30]] +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[CONV32:.+]] = sext i32 %[[TMP33]] to i64 +// IR-NEXT: %[[MUL33:.+]] = 
mul nsw i64 %[[DIV31]], %[[CONV32]] +// IR-NEXT: %[[ADD34:.+]] = add nsw i64 %[[CONV26]], %[[MUL33]] +// IR-NEXT: %[[CONV35:.+]] = trunc i64 %[[ADD34]] to i32 +// IR-NEXT: store i32 %[[CONV35]], i32* %[[I22]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP35:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB36:.+]] = sub i32 %[[TMP36]], -1 +// IR-NEXT: %[[DIV37:.+]] = udiv i32 %[[SUB36]], 2 +// IR-NEXT: %[[MUL38:.+]] = mul i32 1, %[[DIV37]] +// IR-NEXT: %[[CONV39:.+]] = zext i32 %[[MUL38]] to i64 +// IR-NEXT: %[[DIV40:.+]] = sdiv i64 %[[TMP35]], %[[CONV39]] +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_10]], align 4 +// IR-NEXT: %[[SUB41:.+]] = sub i32 %[[TMP37]], -1 +// IR-NEXT: %[[DIV42:.+]] = udiv i32 %[[SUB41]], 2 +// IR-NEXT: %[[MUL43:.+]] = mul i32 1, %[[DIV42]] +// IR-NEXT: %[[CONV44:.+]] = zext i32 %[[MUL43]] to i64 +// IR-NEXT: %[[MUL45:.+]] = mul nsw i64 %[[DIV40]], %[[CONV44]] +// IR-NEXT: %[[SUB46:.+]] = sub nsw i64 %[[TMP34]], %[[MUL45]] +// IR-NEXT: %[[MUL47:.+]] = mul nsw i64 %[[SUB46]], 2 +// IR-NEXT: %[[ADD48:.+]] = add nsw i64 0, %[[MUL47]] +// IR-NEXT: %[[CONV49:.+]] = trunc i64 %[[ADD48]] to i32 +// IR-NEXT: store i32 %[[CONV49]], i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: store i32 %[[TMP38]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[DOTUNROLLED_IV_J23]], align 4 +// IR-NEXT: %[[ADD50:.+]] = add i32 %[[TMP40]], 2 +// IR-NEXT: %[[CMP51:.+]] = icmp ule i32 %[[TMP39]], %[[ADD50]] +// IR-NEXT: br i1 %[[CMP51]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// 
IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP42:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_7]], align 4 +// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP42]], 1 +// IR-NEXT: %[[CMP53:.+]] = icmp ule i32 %[[TMP41]], %[[ADD52]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP43:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP53]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP43]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_4]], align 4 +// IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[TMP46:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[MUL54:.+]] = mul i32 %[[TMP45]], %[[TMP46]] +// IR-NEXT: %[[ADD55:.+]] = add i32 %[[TMP44]], %[[MUL54]] +// IR-NEXT: store i32 %[[ADD55]], i32* %[[J]], align 4 +// IR-NEXT: %[[TMP47:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP48:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP49:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP50:.+]] = load i32, i32* %[[I22]], align 4 +// IR-NEXT: %[[TMP51:.+]] = load i32, i32* %[[J]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP47]], i32 %[[TMP48]], i32 %[[TMP49]], i32 %[[TMP50]], i32 %[[TMP51]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP52:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP52]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_J]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP53:.+]] = load i64, i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: %[[ADD56:.+]] = add nsw i64 %[[TMP53]], 1 +// IR-NEXT: store i64 %[[ADD56]], i64* %[[DOTOMP_IV]], align 8 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_LOOP_EXIT]]: +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP0]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: call void @__kmpc_barrier(%struct.ident_t* @3, i32 %[[TMP0]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp for collapse(2) + for (int i = start; i < end; i+=step) { + #pragma omp unroll partial + for (int j = start; j < end; j+=step) + body(start, end, step, i, j); + } +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} diff --git a/clang/test/OpenMP/tile_codegen_for_dependent.cpp b/clang/test/OpenMP/unroll_codegen_for_partial.cpp copy from clang/test/OpenMP/tile_codegen_for_dependent.cpp copy to 
clang/test/OpenMP/unroll_codegen_for_partial.cpp --- a/clang/test/OpenMP/tile_codegen_for_dependent.cpp +++ b/clang/test/OpenMP/unroll_codegen_for_partial.cpp @@ -6,40 +6,34 @@ // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR // expected-no-diagnostics -// The loop trip count used by #pragma omp for depends on code generated -// by #pragma omp file. Check that theses PreInits are emitted before -// the code generated by #pragma omp for. - #ifndef HEADER #define HEADER // placeholder for loop body code. - extern "C" void body(...) {} - // IR-LABEL: @func( // IR-NEXT: [[ENTRY:.*]]: // IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 -// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV_I12:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP0:.+]] 
= call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) // IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 // IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 @@ -60,106 +54,98 @@ // IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] // IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 // IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP8]], i32* %[[I]], align 4 -// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP8]], 1 // IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 -// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 -// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3 -// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP9]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 // IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 // IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 -// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP11]] +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP10]] // IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_PRECOND_THEN]]: // IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 -// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: store i32 %[[TMP12]], i32* 
%[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 %[[TMP11]], i32* %[[DOTOMP_UB]], align 4 // IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 // IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 // IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP0]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) -// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 -// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP13]], %[[TMP14]] +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP12]], %[[TMP13]] // IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] // IR-EMPTY: // IR-NEXT: [[COND_TRUE]]: -// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 // IR-NEXT: br label %[[COND_END:.+]] // IR-EMPTY: // IR-NEXT: [[COND_FALSE]]: -// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP15:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 // IR-NEXT: br label %[[COND_END]] // IR-EMPTY: // IR-NEXT: [[COND_END]]: -// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP15]], %[[COND_TRUE]] ], [ %[[TMP16]], %[[COND_FALSE]] ] +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP14]], %[[COND_TRUE]] ], [ %[[TMP15]], %[[COND_FALSE]] ] // IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 -// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 -// IR-NEXT: store i32 %[[TMP17]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP16]], i32* %[[DOTOMP_IV]], align 4 // IR-NEXT: 
br label %[[OMP_INNER_FOR_COND:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_COND]]: -// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 -// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP19]], 1 -// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP18]], %[[ADD14]] +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP18]], 1 +// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP17]], %[[ADD14]] // IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_BODY]]: -// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP20]], 4 +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP19]], 2 // IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]] -// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTFLOOR_0_IV_I12]], align 4 -// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 -// IR-NEXT: store i32 %[[TMP21]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: store i32 %[[TMP20]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_COND]]: -// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP23]], 1 -// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 -// IR-NEXT: %[[ADD18:.+]] = add nsw i32 %[[TMP24]], 4 -// IR-NEXT: %[[CMP19:.+]] = icmp ult i32 %[[ADD17]], %[[ADD18]] -// IR-NEXT: br i1 %[[CMP19]], label %[[COND_TRUE20:.+]], label 
%[[COND_FALSE22:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_TRUE20]]: -// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD21:.+]] = add i32 %[[TMP25]], 1 -// IR-NEXT: br label %[[COND_END24:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_FALSE22]]: -// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I12]], align 4 -// IR-NEXT: %[[ADD23:.+]] = add nsw i32 %[[TMP26]], 4 -// IR-NEXT: br label %[[COND_END24]] -// IR-EMPTY: -// IR-NEXT: [[COND_END24]]: -// IR-NEXT: %[[COND25:.+]] = phi i32 [ %[[ADD21]], %[[COND_TRUE20]] ], [ %[[ADD23]], %[[COND_FALSE22]] ] -// IR-NEXT: %[[CMP26:.+]] = icmp ult i32 %[[TMP22]], %[[COND25]] -// IR-NEXT: br i1 %[[CMP26]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP22]], 2 +// IR-NEXT: %[[CMP18:.+]] = icmp ule i32 %[[TMP21]], %[[ADD17]] +// IR-NEXT: br i1 %[[CMP18]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP24]], 1 +// IR-NEXT: %[[CMP20:.+]] = icmp ule i32 %[[TMP23]], %[[ADD19]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP25:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP20]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP25]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_BODY]]: -// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP28]], %[[TMP29]] -// IR-NEXT: %[[ADD28:.+]] = 
add i32 %[[TMP27]], %[[MUL27]] -// IR-NEXT: store i32 %[[ADD28]], i32* %[[I]], align 4 -// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[END_ADDR]], align 4 -// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 -// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[I]], align 4 -// IR-NEXT: call void (...) @body(i32 %[[TMP30]], i32 %[[TMP31]], i32 %[[TMP32]], i32 %[[TMP33]]) +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL21:.+]] = mul i32 %[[TMP27]], %[[TMP28]] +// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP26]], %[[MUL21]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP29]], i32 %[[TMP30]], i32 %[[TMP31]], i32 %[[TMP32]]) // IR-NEXT: br label %[[FOR_INC:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_INC]]: -// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP34]], 1 -// IR-NEXT: store i32 %[[INC]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP33]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] // IR-EMPTY: // IR-NEXT: [[FOR_END]]: @@ -169,9 +155,9 @@ // IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_INC]]: -// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP35]], 1 -// IR-NEXT: store i32 %[[ADD29]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: store i32 %[[ADD23]], i32* %[[DOTOMP_IV]], align 4 // IR-NEXT: br label %[[OMP_INNER_FOR_COND]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_END]]: @@ -186,10 +172,16 @@ // IR-NEXT: ret void // IR-NEXT: } extern "C" void func(int start, int end, int step) { -#pragma omp for -#pragma omp tile sizes(4) - for (int i = start; i < end; i += step) + int i; + #pragma omp for + #pragma omp unroll partial + for (i = start; i < end; i+=step) body(start, end, step, i); } #endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} diff --git a/clang/test/OpenMP/unroll_codegen_full.cpp b/clang/test/OpenMP/unroll_codegen_full.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_full.cpp @@ -0,0 +1,52 @@ +// Check code generation +// RUN: 
%clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) {} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 7, i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP0]], 17 +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP1]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP2]], 3 +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func() { + #pragma omp unroll full + for (int i = 7; i < 17; i += 3) + body(i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.full"} diff --git a/clang/test/OpenMP/unroll_codegen_heuristic.cpp b/clang/test/OpenMP/unroll_codegen_heuristic.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_heuristic.cpp @@ -0,0 +1,64 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp b/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_parallel_for_factor.cpp @@ -0,0 +1,210 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %[[END_ADDR]], i32* %[[STEP_ADDR]], i32* %[[START_ADDR]]) +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp parallel for + #pragma omp unroll partial(7) + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + + +// IR-LABEL: @.omp_outlined.( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[DOTGLOBAL_TID__ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[DOTBOUND_TID__ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32*, align 8 +// IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 +// IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_STRIDE:.+]] = 
alloca i32, align 4 +// IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I12:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32* %[[DOTGLOBAL_TID_:.+]], i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: store i32* %[[DOTBOUND_TID_:.+]], i32** %[[DOTBOUND_TID__ADDR]], align 8 +// IR-NEXT: store i32* %[[END:.+]], i32** %[[END_ADDR]], align 8 +// IR-NEXT: store i32* %[[STEP:.+]], i32** %[[STEP_ADDR]], align 8 +// IR-NEXT: store i32* %[[START:.+]], i32** %[[START_ADDR]], align 8 +// IR-NEXT: %[[TMP0:.+]] = load i32*, i32** %[[END_ADDR]], align 8 +// IR-NEXT: %[[TMP1:.+]] = load i32*, i32** %[[STEP_ADDR]], align 8 +// IR-NEXT: %[[TMP2:.+]] = load i32*, i32** %[[START_ADDR]], align 8 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[TMP0]], align 4 +// IR-NEXT: store i32 %[[TMP5]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[TMP1]], align 4 +// IR-NEXT: store i32 %[[TMP6]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP7]], %[[TMP8]] +// IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 +// IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP9]] +// IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP10]] +// IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 +// IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[TMP11:.+]] = load i32, i32* 
%[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP11]], 1 +// IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP12]], -6 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 7 +// IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 +// IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] +// IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_THEN]]: +// IR-NEXT: store i32 0, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: %[[TMP14:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: store i32 %[[TMP14]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: store i32 1, i32* %[[DOTOMP_STRIDE]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTOMP_IS_LAST]], align 4 +// IR-NEXT: %[[TMP15:.+]] = load i32*, i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: %[[TMP16:.+]] = load i32, i32* %[[TMP15]], align 4 +// IR-NEXT: call void @__kmpc_for_static_init_4u(%struct.ident_t* @1, i32 %[[TMP16]], i32 34, i32* %[[DOTOMP_IS_LAST]], i32* %[[DOTOMP_LB]], i32* %[[DOTOMP_UB]], i32* %[[DOTOMP_STRIDE]], i32 1, i32 1) +// IR-NEXT: %[[TMP17:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP18:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: %[[CMP13:.+]] = icmp ugt i32 %[[TMP17]], %[[TMP18]] +// IR-NEXT: br i1 %[[CMP13]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_TRUE]]: +// IR-NEXT: %[[TMP19:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 +// IR-NEXT: br label %[[COND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[COND_FALSE]]: +// IR-NEXT: %[[TMP20:.+]] = load i32, i32* %[[DOTOMP_UB]], 
align 4 +// IR-NEXT: br label %[[COND_END]] +// IR-EMPTY: +// IR-NEXT: [[COND_END]]: +// IR-NEXT: %[[COND:.+]] = phi i32 [ %[[TMP19]], %[[COND_TRUE]] ], [ %[[TMP20]], %[[COND_FALSE]] ] +// IR-NEXT: store i32 %[[COND]], i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[TMP21:.+]] = load i32, i32* %[[DOTOMP_LB]], align 4 +// IR-NEXT: store i32 %[[TMP21]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_COND]]: +// IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTOMP_UB]], align 4 +// IR-NEXT: %[[ADD14:.+]] = add i32 %[[TMP23]], 1 +// IR-NEXT: %[[CMP15:.+]] = icmp ult i32 %[[TMP22]], %[[ADD14]] +// IR-NEXT: br i1 %[[CMP15]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_BODY]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP24]], 7 +// IR-NEXT: %[[ADD16:.+]] = add i32 0, %[[MUL]] +// IR-NEXT: store i32 %[[ADD16]], i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: store i32 %[[TMP25]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I12]], align 4 +// IR-NEXT: %[[ADD17:.+]] = add i32 %[[TMP27]], 7 +// IR-NEXT: %[[CMP18:.+]] = icmp ule i32 %[[TMP26]], %[[ADD17]] +// IR-NEXT: br i1 %[[CMP18]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 +// IR-NEXT: %[[ADD19:.+]] = add i32 %[[TMP29]], 1 +// IR-NEXT: %[[CMP20:.+]] = icmp ule i32 
%[[TMP28]], %[[ADD19]] +// IR-NEXT: br label %[[LAND_END]] +// IR-EMPTY: +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP30:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP20]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP30]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[MUL21:.+]] = mul i32 %[[TMP32]], %[[TMP33]] +// IR-NEXT: %[[ADD22:.+]] = add i32 %[[TMP31]], %[[MUL21]] +// IR-NEXT: store i32 %[[ADD22]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[TMP2]], align 4 +// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[TMP0]], align 4 +// IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[TMP1]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) @body(i32 %[[TMP34]], i32 %[[TMP35]], i32 %[[TMP36]], i32 %[[TMP37]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP38]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_BODY_CONTINUE]]: +// IR-NEXT: br label %[[OMP_INNER_FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_INC]]: +// IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP39]], 1 +// IR-NEXT: store i32 %[[ADD23]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: br label %[[OMP_INNER_FOR_COND]] +// IR-EMPTY: +// IR-NEXT: [[OMP_INNER_FOR_END]]: +// IR-NEXT: br label %[[OMP_LOOP_EXIT:.+]] +// IR-EMPTY: +// IR-NEXT: 
[[OMP_LOOP_EXIT]]: +// IR-NEXT: %[[TMP40:.+]] = load i32*, i32** %[[DOTGLOBAL_TID__ADDR]], align 8 +// IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[TMP40]], align 4 +// IR-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @1, i32 %[[TMP41]]) +// IR-NEXT: br label %[[OMP_PRECOND_END]] +// IR-EMPTY: +// IR-NEXT: [[OMP_PRECOND_END]]: +// IR-NEXT: ret void +// IR-NEXT: } + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 7} diff --git a/clang/test/OpenMP/unroll_codegen_partial.cpp b/clang/test/OpenMP/unroll_codegen_partial.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_codegen_partial.cpp @@ -0,0 +1,64 @@ +// Check code generation +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-llvm %s -o - | FileCheck %s --check-prefix=IR + +// Check same results after serialization round-trip +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -emit-pch -o %t %s +// RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +// placeholder for loop body code. +extern "C" void body(...) 
{} + + +// IR-LABEL: @func( +// IR-NEXT: [[ENTRY:.*]]: +// IR-NEXT: %[[START_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[END_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP0:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP0]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND]]: +// IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[CMP:.+]] = icmp slt i32 %[[TMP1]], %[[TMP2]] +// IR-NEXT: br i1 %[[CMP]], label %[[FOR_BODY:.+]], label %[[FOR_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_BODY]]: +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: call void (...) 
@body(i32 %[[TMP3]], i32 %[[TMP4]], i32 %[[TMP5]], i32 %[[TMP6]]) +// IR-NEXT: br label %[[FOR_INC:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_INC]]: +// IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[I]], align 4 +// IR-NEXT: %[[ADD:.+]] = add nsw i32 %[[TMP8]], %[[TMP7]] +// IR-NEXT: store i32 %[[ADD]], i32* %[[I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_END]]: +// IR-NEXT: ret void +// IR-NEXT: } +extern "C" void func(int start, int end, int step) { + #pragma omp unroll partial + for (int i = start; i < end; i+=step) + body(start, end, step, i); +} + +#endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.enable"} diff --git a/clang/test/OpenMP/tile_codegen_for_multiple.cpp b/clang/test/OpenMP/unroll_codegen_tile_for.cpp copy from clang/test/OpenMP/tile_codegen_for_multiple.cpp copy to clang/test/OpenMP/unroll_codegen_tile_for.cpp --- a/clang/test/OpenMP/tile_codegen_for_multiple.cpp +++ b/clang/test/OpenMP/unroll_codegen_tile_for.cpp @@ -6,9 +6,6 @@ // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR // expected-no-diagnostics -// Account for multiple transformations of a loop before consumed by -// #pragma omp for. 
- #ifndef HEADER #define HEADER @@ -23,64 +20,64 @@ // IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 -// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTTILE_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTFLOOR_0_IV__UNROLLED_IV_I18:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTTILE_0_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) // IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 // IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 // IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 // IR-NEXT: %[[TMP1:.+]] = load i32, i32* %[[START_ADDR]], 
align 4 -// IR-NEXT: store i32 %[[TMP1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP4]], %[[TMP5]] +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] // IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 -// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP6]] // IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] // IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 // IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP8]], i32* %[[I]], align 4 +// IR-NEXT: store i32 0, i32* 
%[[DOTUNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 // IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 // IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 // IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 -// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3 -// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 // IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 // IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 // IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 // IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 // IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 // IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 -// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -2 -// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 3 +// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -3 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 4 // IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1 // IR-NEXT: store i32 %[[SUB17]], i32* %[[DOTCAPTURE_EXPR_14]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 // IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] // IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] @@ -121,19 +118,19 @@ // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_BODY]]: // IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 3 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 4 // IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]] -// IR-NEXT: store i32 %[[ADD22]], i32* 
%[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: store i32 %[[ADD22]], i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_COND]]: -// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 // IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1 -// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 3 +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 4 // IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]] // IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]] // IR-EMPTY: @@ -143,55 +140,49 @@ // IR-NEXT: br label %[[COND_END30:.+]] // IR-EMPTY: // IR-NEXT: [[COND_FALSE28]]: -// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 3 +// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 4 // IR-NEXT: br label %[[COND_END30]] // IR-EMPTY: // IR-NEXT: [[COND_END30]]: // IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ] // IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]] -// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END51:.+]] +// 
IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END45:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_BODY]]: -// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 4 +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 2 // IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]] -// IR-NEXT: store i32 %[[ADD34]], i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: store i32 %[[ADD34]], i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND35:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_COND35]]: -// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 1 -// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[ADD37:.+]] = add nsw i32 %[[TMP33]], 4 -// IR-NEXT: %[[CMP38:.+]] = icmp ult i32 %[[ADD36]], %[[ADD37]] -// IR-NEXT: br i1 %[[CMP38]], label %[[COND_TRUE39:.+]], label %[[COND_FALSE41:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_TRUE39]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 2 +// IR-NEXT: %[[CMP37:.+]] = icmp ule i32 %[[TMP31]], %[[ADD36]] +// IR-NEXT: br i1 %[[CMP37]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: %[[TMP34:.+]] = load i32, i32* 
%[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD40:.+]] = add i32 %[[TMP34]], 1 -// IR-NEXT: br label %[[COND_END43:.+]] +// IR-NEXT: %[[ADD38:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: %[[CMP39:.+]] = icmp ule i32 %[[TMP33]], %[[ADD38]] +// IR-NEXT: br label %[[LAND_END]] // IR-EMPTY: -// IR-NEXT: [[COND_FALSE41]]: -// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[ADD42:.+]] = add nsw i32 %[[TMP35]], 4 -// IR-NEXT: br label %[[COND_END43]] +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP35:.+]] = phi i1 [ false, %[[FOR_COND35]] ], [ %[[CMP39]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP35]], label %[[FOR_BODY40:.+]], label %[[FOR_END:.+]] // IR-EMPTY: -// IR-NEXT: [[COND_END43]]: -// IR-NEXT: %[[COND44:.+]] = phi i32 [ %[[ADD40]], %[[COND_TRUE39]] ], [ %[[ADD42]], %[[COND_FALSE41]] ] -// IR-NEXT: %[[CMP45:.+]] = icmp ult i32 %[[TMP31]], %[[COND44]] -// IR-NEXT: br i1 %[[CMP45]], label %[[FOR_BODY46:.+]], label %[[FOR_END:.+]] -// IR-EMPTY: -// IR-NEXT: [[FOR_BODY46]]: +// IR-NEXT: [[FOR_BODY40]]: // IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[MUL47:.+]] = mul i32 %[[TMP37]], %[[TMP38]] -// IR-NEXT: %[[ADD48:.+]] = add i32 %[[TMP36]], %[[MUL47]] -// IR-NEXT: store i32 %[[ADD48]], i32* %[[I]], align 4 +// IR-NEXT: %[[MUL41:.+]] = mul i32 %[[TMP37]], %[[TMP38]] +// IR-NEXT: %[[ADD42:.+]] = add i32 %[[TMP36]], %[[MUL41]] +// IR-NEXT: store i32 %[[ADD42]], i32* %[[I]], align 4 // IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[START_ADDR]], align 4 // IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[END_ADDR]], align 4 // IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 @@ -200,21 +191,21 @@ // IR-NEXT: br label %[[FOR_INC:.+]] // IR-EMPTY: // IR-NEXT: 
[[FOR_INC]]: -// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP43]], 1 -// IR-NEXT: store i32 %[[INC]], i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]] // IR-EMPTY: // IR-NEXT: [[FOR_END]]: -// IR-NEXT: br label %[[FOR_INC49:.+]] +// IR-NEXT: br label %[[FOR_INC43:.+]] // IR-EMPTY: -// IR-NEXT: [[FOR_INC49]]: -// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[INC50:.+]] = add i32 %[[TMP44]], 1 -// IR-NEXT: store i32 %[[INC50]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP4:[0-9]+]] +// IR-NEXT: [[FOR_INC43]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[INC44:.+]] = add i32 %[[TMP44]], 1 +// IR-NEXT: store i32 %[[INC44]], i32* %[[DOTTILE_0_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP5:[0-9]+]] // IR-EMPTY: -// IR-NEXT: [[FOR_END51]]: +// IR-NEXT: [[FOR_END45]]: // IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_BODY_CONTINUE]]: @@ -222,8 +213,8 @@ // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_INC]]: // IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP45]], 1 -// IR-NEXT: store i32 %[[ADD52]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD46:.+]] = add i32 %[[TMP45]], 1 +// IR-NEXT: store i32 %[[ADD46]], i32* %[[DOTOMP_IV]], align 4 // IR-NEXT: br label %[[OMP_INNER_FOR_COND]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_END]]: @@ -238,11 +229,17 @@ // IR-NEXT: ret void // IR-NEXT: } extern "C" void func(int start, int end, int step) { -#pragma omp for -#pragma omp tile 
sizes(3) -#pragma omp tile sizes(4) - for (int i = start; i < end; i += step) + #pragma omp for + #pragma omp tile sizes(4) + #pragma omp unroll partial + for (int i = start; i < end; i+=step) body(start, end, step, i); } #endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} +// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP3]]} diff --git a/clang/test/OpenMP/tile_codegen_for_multiple.cpp b/clang/test/OpenMP/unroll_codegen_unroll_for.cpp rename from clang/test/OpenMP/tile_codegen_for_multiple.cpp rename to clang/test/OpenMP/unroll_codegen_unroll_for.cpp --- a/clang/test/OpenMP/tile_codegen_for_multiple.cpp +++ b/clang/test/OpenMP/unroll_codegen_unroll_for.cpp @@ -6,9 +6,6 @@ // RUN: %clang_cc1 -verify -triple x86_64-pc-linux-gnu -fopenmp -fopenmp-version=51 -include-pch %t -emit-llvm %s -o - | FileCheck %s --check-prefix=IR // expected-no-diagnostics -// Account for multiple transformations of a loop before consumed by -// #pragma omp for. 
- #ifndef HEADER #define HEADER @@ -23,64 +20,64 @@ // IR-NEXT: %[[STEP_ADDR:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IV:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP:.+]] = alloca i32, align 4 +// IR-NEXT: %[[I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_1:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_2:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_3:.+]] = alloca i32, align 4 -// IR-NEXT: %[[I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_6:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_8:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_12:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTCAPTURE_EXPR_14:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_LB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_UB:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_STRIDE:.+]] = alloca i32, align 4 // IR-NEXT: %[[DOTOMP_IS_LAST:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTTILE_0_IV__FLOOR_0_IV_I:.+]] = alloca i32, align 4 -// IR-NEXT: %[[DOTTILE_0_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLLED_IV__UNROLLED_IV_I18:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I:.+]] = alloca i32, align 4 +// IR-NEXT: %[[DOTUNROLL_INNER_IV_I:.+]] = alloca i32, align 4 // IR-NEXT: %[[TMP0:.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) // IR-NEXT: store i32 %[[START:.+]], i32* %[[START_ADDR]], align 4 // IR-NEXT: store i32 %[[END:.+]], i32* %[[END_ADDR]], align 4 // IR-NEXT: store i32 %[[STEP:.+]], i32* %[[STEP_ADDR]], align 4 // IR-NEXT: %[[TMP1:.+]] = load i32, i32* 
%[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP1]], i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[END_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 -// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP4]], %[[TMP5]] +// IR-NEXT: store i32 %[[TMP1]], i32* %[[I]], align 4 +// IR-NEXT: %[[TMP2:.+]] = load i32, i32* %[[START_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP2]], i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[TMP3:.+]] = load i32, i32* %[[END_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP3]], i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP4:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 +// IR-NEXT: store i32 %[[TMP4]], i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[TMP5:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_1]], align 4 +// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 +// IR-NEXT: %[[SUB:.+]] = sub i32 %[[TMP5]], %[[TMP6]] // IR-NEXT: %[[SUB4:.+]] = sub i32 %[[SUB]], 1 -// IR-NEXT: %[[TMP6:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP6]] // IR-NEXT: %[[TMP7:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP7]] +// IR-NEXT: %[[ADD:.+]] = add i32 %[[SUB4]], %[[TMP7]] +// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 +// IR-NEXT: %[[DIV:.+]] = udiv i32 %[[ADD]], %[[TMP8]] // IR-NEXT: %[[SUB5:.+]] = sub i32 %[[DIV]], 1 // IR-NEXT: store i32 %[[SUB5]], i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[TMP8:.+]] = load i32, i32* %[[START_ADDR]], align 4 -// IR-NEXT: store i32 %[[TMP8]], i32* %[[I]], align 4 +// IR-NEXT: 
store i32 0, i32* %[[DOTUNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP9:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 // IR-NEXT: %[[ADD7:.+]] = add i32 %[[TMP9]], 1 // IR-NEXT: store i32 %[[ADD7]], i32* %[[DOTCAPTURE_EXPR_6]], align 4 // IR-NEXT: %[[TMP10:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_6]], align 4 -// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -3 -// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 4 +// IR-NEXT: %[[SUB9:.+]] = sub i32 %[[TMP10]], -1 +// IR-NEXT: %[[DIV10:.+]] = udiv i32 %[[SUB9]], 2 // IR-NEXT: %[[SUB11:.+]] = sub i32 %[[DIV10]], 1 // IR-NEXT: store i32 %[[SUB11]], i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV_I]], align 4 // IR-NEXT: %[[TMP11:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 // IR-NEXT: %[[ADD13:.+]] = add i32 %[[TMP11]], 1 // IR-NEXT: store i32 %[[ADD13]], i32* %[[DOTCAPTURE_EXPR_12]], align 4 // IR-NEXT: %[[TMP12:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 -// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -2 -// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 3 +// IR-NEXT: %[[SUB15:.+]] = sub i32 %[[TMP12]], -1 +// IR-NEXT: %[[DIV16:.+]] = udiv i32 %[[SUB15]], 2 // IR-NEXT: %[[SUB17:.+]] = sub i32 %[[DIV16]], 1 // IR-NEXT: store i32 %[[SUB17]], i32* %[[DOTCAPTURE_EXPR_14]], align 4 -// IR-NEXT: store i32 0, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: store i32 0, i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP13:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_12]], align 4 // IR-NEXT: %[[CMP:.+]] = icmp ult i32 0, %[[TMP13]] // IR-NEXT: br i1 %[[CMP]], label %[[OMP_PRECOND_THEN:.+]], label %[[OMP_PRECOND_END:.+]] @@ -121,77 +118,65 @@ // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_BODY]]: // IR-NEXT: %[[TMP22:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 3 +// IR-NEXT: %[[MUL:.+]] = mul i32 %[[TMP22]], 2 // IR-NEXT: %[[ADD22:.+]] = add i32 0, %[[MUL]] -// IR-NEXT: store i32 %[[ADD22]], i32* 
%[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 +// IR-NEXT: store i32 %[[ADD22]], i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[TMP23:.+]] = load i32, i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: store i32 %[[TMP23]], i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: br label %[[FOR_COND:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_COND]]: -// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 1 -// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[ADD24:.+]] = add i32 %[[TMP26]], 3 -// IR-NEXT: %[[CMP25:.+]] = icmp ult i32 %[[ADD23]], %[[ADD24]] -// IR-NEXT: br i1 %[[CMP25]], label %[[COND_TRUE26:.+]], label %[[COND_FALSE28:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_TRUE26]]: +// IR-NEXT: %[[TMP24:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP25:.+]] = load i32, i32* %[[DOTUNROLLED_IV__UNROLLED_IV_I18]], align 4 +// IR-NEXT: %[[ADD23:.+]] = add i32 %[[TMP25]], 2 +// IR-NEXT: %[[CMP24:.+]] = icmp ule i32 %[[TMP24]], %[[ADD23]] +// IR-NEXT: br i1 %[[CMP24]], label %[[LAND_RHS:.+]], label %[[LAND_END:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS]]: +// IR-NEXT: %[[TMP26:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 // IR-NEXT: %[[TMP27:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_8]], align 4 -// IR-NEXT: %[[ADD27:.+]] = add i32 %[[TMP27]], 1 -// IR-NEXT: br label %[[COND_END30:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_FALSE28]]: -// IR-NEXT: %[[TMP28:.+]] = load i32, i32* %[[DOTFLOOR_0_IV__FLOOR_0_IV_I18]], align 4 -// IR-NEXT: %[[ADD29:.+]] = add i32 %[[TMP28]], 3 -// IR-NEXT: br label %[[COND_END30]] 
+// IR-NEXT: %[[ADD25:.+]] = add i32 %[[TMP27]], 1 +// IR-NEXT: %[[CMP26:.+]] = icmp ule i32 %[[TMP26]], %[[ADD25]] +// IR-NEXT: br label %[[LAND_END]] // IR-EMPTY: -// IR-NEXT: [[COND_END30]]: -// IR-NEXT: %[[COND31:.+]] = phi i32 [ %[[ADD27]], %[[COND_TRUE26]] ], [ %[[ADD29]], %[[COND_FALSE28]] ] -// IR-NEXT: %[[CMP32:.+]] = icmp ult i32 %[[TMP24]], %[[COND31]] -// IR-NEXT: br i1 %[[CMP32]], label %[[FOR_BODY:.+]], label %[[FOR_END51:.+]] +// IR-NEXT: [[LAND_END]]: +// IR-NEXT: %[[TMP28:.+]] = phi i1 [ false, %[[FOR_COND]] ], [ %[[CMP26]], %[[LAND_RHS]] ] +// IR-NEXT: br i1 %[[TMP28]], label %[[FOR_BODY:.+]], label %[[FOR_END41:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_BODY]]: -// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[MUL33:.+]] = mul i32 %[[TMP29]], 4 -// IR-NEXT: %[[ADD34:.+]] = add i32 0, %[[MUL33]] -// IR-NEXT: store i32 %[[ADD34]], i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[TMP30:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: br label %[[FOR_COND35:.+]] -// IR-EMPTY: -// IR-NEXT: [[FOR_COND35]]: -// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD36:.+]] = add i32 %[[TMP32]], 1 -// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[ADD37:.+]] = add nsw i32 %[[TMP33]], 4 -// IR-NEXT: %[[CMP38:.+]] = icmp ult i32 %[[ADD36]], %[[ADD37]] -// IR-NEXT: br i1 %[[CMP38]], label %[[COND_TRUE39:.+]], label %[[COND_FALSE41:.+]] -// IR-EMPTY: -// IR-NEXT: [[COND_TRUE39]]: +// IR-NEXT: %[[TMP29:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[MUL27:.+]] = mul i32 %[[TMP29]], 2 +// IR-NEXT: %[[ADD28:.+]] = add i32 0, %[[MUL27]] +// IR-NEXT: store i32 %[[ADD28]], i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[TMP30:.+]] = load i32, i32* 
%[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: store i32 %[[TMP30]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND29:.+]] +// IR-EMPTY: +// IR-NEXT: [[FOR_COND29]]: +// IR-NEXT: %[[TMP31:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[TMP32:.+]] = load i32, i32* %[[DOTUNROLLED_IV_I]], align 4 +// IR-NEXT: %[[ADD30:.+]] = add i32 %[[TMP32]], 2 +// IR-NEXT: %[[CMP31:.+]] = icmp ule i32 %[[TMP31]], %[[ADD30]] +// IR-NEXT: br i1 %[[CMP31]], label %[[LAND_RHS32:.+]], label %[[LAND_END35:.+]] +// IR-EMPTY: +// IR-NEXT: [[LAND_RHS32]]: +// IR-NEXT: %[[TMP33:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // IR-NEXT: %[[TMP34:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_3]], align 4 -// IR-NEXT: %[[ADD40:.+]] = add i32 %[[TMP34]], 1 -// IR-NEXT: br label %[[COND_END43:.+]] +// IR-NEXT: %[[ADD33:.+]] = add i32 %[[TMP34]], 1 +// IR-NEXT: %[[CMP34:.+]] = icmp ule i32 %[[TMP33]], %[[ADD33]] +// IR-NEXT: br label %[[LAND_END35]] // IR-EMPTY: -// IR-NEXT: [[COND_FALSE41]]: -// IR-NEXT: %[[TMP35:.+]] = load i32, i32* %[[DOTFLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[ADD42:.+]] = add nsw i32 %[[TMP35]], 4 -// IR-NEXT: br label %[[COND_END43]] +// IR-NEXT: [[LAND_END35]]: +// IR-NEXT: %[[TMP35:.+]] = phi i1 [ false, %[[FOR_COND29]] ], [ %[[CMP34]], %[[LAND_RHS32]] ] +// IR-NEXT: br i1 %[[TMP35]], label %[[FOR_BODY36:.+]], label %[[FOR_END:.+]] // IR-EMPTY: -// IR-NEXT: [[COND_END43]]: -// IR-NEXT: %[[COND44:.+]] = phi i32 [ %[[ADD40]], %[[COND_TRUE39]] ], [ %[[ADD42]], %[[COND_FALSE41]] ] -// IR-NEXT: %[[CMP45:.+]] = icmp ult i32 %[[TMP31]], %[[COND44]] -// IR-NEXT: br i1 %[[CMP45]], label %[[FOR_BODY46:.+]], label %[[FOR_END:.+]] -// IR-EMPTY: -// IR-NEXT: [[FOR_BODY46]]: +// IR-NEXT: [[FOR_BODY36]]: // IR-NEXT: %[[TMP36:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_]], align 4 -// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 +// IR-NEXT: %[[TMP37:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 // 
IR-NEXT: %[[TMP38:.+]] = load i32, i32* %[[DOTCAPTURE_EXPR_2]], align 4 -// IR-NEXT: %[[MUL47:.+]] = mul i32 %[[TMP37]], %[[TMP38]] -// IR-NEXT: %[[ADD48:.+]] = add i32 %[[TMP36]], %[[MUL47]] -// IR-NEXT: store i32 %[[ADD48]], i32* %[[I]], align 4 +// IR-NEXT: %[[MUL37:.+]] = mul i32 %[[TMP37]], %[[TMP38]] +// IR-NEXT: %[[ADD38:.+]] = add i32 %[[TMP36]], %[[MUL37]] +// IR-NEXT: store i32 %[[ADD38]], i32* %[[I]], align 4 // IR-NEXT: %[[TMP39:.+]] = load i32, i32* %[[START_ADDR]], align 4 // IR-NEXT: %[[TMP40:.+]] = load i32, i32* %[[END_ADDR]], align 4 // IR-NEXT: %[[TMP41:.+]] = load i32, i32* %[[STEP_ADDR]], align 4 @@ -200,21 +185,21 @@ // IR-NEXT: br label %[[FOR_INC:.+]] // IR-EMPTY: // IR-NEXT: [[FOR_INC]]: -// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: %[[INC:.+]] = add nsw i32 %[[TMP43]], 1 -// IR-NEXT: store i32 %[[INC]], i32* %[[DOTTILE_0_IV_I]], align 4 -// IR-NEXT: br label %[[FOR_COND35]], !llvm.loop ![[LOOP2:[0-9]+]] +// IR-NEXT: %[[TMP43:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: %[[INC:.+]] = add i32 %[[TMP43]], 1 +// IR-NEXT: store i32 %[[INC]], i32* %[[DOTUNROLL_INNER_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND29]], !llvm.loop ![[LOOP2:[0-9]+]] // IR-EMPTY: // IR-NEXT: [[FOR_END]]: -// IR-NEXT: br label %[[FOR_INC49:.+]] +// IR-NEXT: br label %[[FOR_INC39:.+]] // IR-EMPTY: -// IR-NEXT: [[FOR_INC49]]: -// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: %[[INC50:.+]] = add i32 %[[TMP44]], 1 -// IR-NEXT: store i32 %[[INC50]], i32* %[[DOTTILE_0_IV__FLOOR_0_IV_I]], align 4 -// IR-NEXT: br label %[[FOR_COND]], !llvm.loop ![[LOOP4:[0-9]+]] +// IR-NEXT: [[FOR_INC39]]: +// IR-NEXT: %[[TMP44:.+]] = load i32, i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: %[[INC40:.+]] = add i32 %[[TMP44]], 1 +// IR-NEXT: store i32 %[[INC40]], i32* %[[DOTUNROLL_INNER_IV__UNROLLED_IV_I]], align 4 +// IR-NEXT: br label %[[FOR_COND]], 
!llvm.loop ![[LOOP5:[0-9]+]] // IR-EMPTY: -// IR-NEXT: [[FOR_END51]]: +// IR-NEXT: [[FOR_END41]]: // IR-NEXT: br label %[[OMP_BODY_CONTINUE:.+]] // IR-EMPTY: // IR-NEXT: [[OMP_BODY_CONTINUE]]: @@ -222,8 +207,8 @@ // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_INC]]: // IR-NEXT: %[[TMP45:.+]] = load i32, i32* %[[DOTOMP_IV]], align 4 -// IR-NEXT: %[[ADD52:.+]] = add i32 %[[TMP45]], 1 -// IR-NEXT: store i32 %[[ADD52]], i32* %[[DOTOMP_IV]], align 4 +// IR-NEXT: %[[ADD42:.+]] = add i32 %[[TMP45]], 1 +// IR-NEXT: store i32 %[[ADD42]], i32* %[[DOTOMP_IV]], align 4 // IR-NEXT: br label %[[OMP_INNER_FOR_COND]] // IR-EMPTY: // IR-NEXT: [[OMP_INNER_FOR_END]]: @@ -238,11 +223,17 @@ // IR-NEXT: ret void // IR-NEXT: } extern "C" void func(int start, int end, int step) { -#pragma omp for -#pragma omp tile sizes(3) -#pragma omp tile sizes(4) - for (int i = start; i < end; i += step) + #pragma omp for + #pragma omp unroll partial + #pragma omp unroll partial + for (int i = start; i < end; i+=step) body(start, end, step, i); } #endif /* HEADER */ + + +// IR: ![[LOOP2]] = distinct !{![[LOOP2]], ![[LOOPPROP3:[0-9]+]], ![[LOOPPROP4:[0-9]+]]} +// IR: ![[LOOPPROP3]] = !{!"llvm.loop.mustprogress"} +// IR: ![[LOOPPROP4]] = !{!"llvm.loop.unroll.count", i32 2} +// IR: ![[LOOP5]] = distinct !{![[LOOP5]], ![[LOOPPROP3]], ![[LOOPPROP4]]} diff --git a/clang/test/OpenMP/unroll_messages.cpp b/clang/test/OpenMP/unroll_messages.cpp new file mode 100644 --- /dev/null +++ b/clang/test/OpenMP/unroll_messages.cpp @@ -0,0 +1,130 @@ +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -std=c++17 -fopenmp -fopenmp-version=51 -fsyntax-only -Wuninitialized -verify %s + +void func(int n) { + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + func(n); + + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + ; + + // expected-error@+2 {{the loop condition expression depends on the current loop control variable}} + 
#pragma omp unroll + for (int i = 0; i < 2*(i-4); ++i) {} + + // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', '>=', or '!=') of loop variable 'i'}} + #pragma omp unroll + for (int i = 0; i/3 < 7; ++i) {} + + // expected-warning@+1 {{extra tokens at the end of '#pragma omp unroll' are ignored}} + #pragma omp unroll foo + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial( + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4 + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4+ + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(for) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{integral constant expression must have integral or unscoped enumeration type, not 'void (int)'}} + #pragma omp unroll partial(func) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected expression}} + #pragma omp unroll partial() + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}} + #pragma omp unroll partial(4,4) + for (int i = 0; i < n; ++i) {} + + // expected-error@+3 {{expression is not an integral constant expression}} expected-note@+3 {{read of non-const variable 'a' is not allowed in a constant expression}} + // expected-note@+1 {{declared here}} + int a; + #pragma omp unroll partial(a) + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{argument to 'partial' clause must be a strictly positive integer value}} + #pragma omp unroll partial(0) + for (int i = 
0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'partial' clause}} + #pragma omp unroll partial partial + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'partial' clause}} + #pragma omp unroll partial(4) partial + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{directive '#pragma omp unroll' cannot contain more than one 'full' clause}} + #pragma omp unroll full full + for (int i = 0; i < 128; ++i) {} + + // expected-error@+1 {{'full' and 'partial' clause are mutually exclusive and may not appear on the same directive}} expected-note@+1 {{'partial' clause is specified here}} + #pragma omp unroll partial full + for (int i = 0; i < n; ++i) {} + + // expected-error@+1 {{'partial' and 'full' clause are mutually exclusive and may not appear on the same directive}} expected-note@+1 {{'full' clause is specified here}} + #pragma omp unroll full partial + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{loop to be fully unrolled must have a constant trip count}} expected-note@+1 {{'#pragma omp unroll full' directive found here}} + #pragma omp unroll full + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp for' must be a for loop}} + #pragma omp for + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp for' must be a for loop}} + #pragma omp for + #pragma omp unroll full + for (int i = 0; i < 128; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp unroll' must be a for loop}} + #pragma omp unroll + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+2 {{statement after '#pragma omp tile' must be a for loop}} + #pragma omp tile sizes(4) + #pragma omp unroll + for (int i = 0; i < n; ++i) {} + + // expected-error@+4 {{expected 2 for loops after '#pragma omp for', but found only 1}} + // expected-note@+1 {{as 
specified in 'collapse' clause}} + #pragma omp for collapse(2) + for (int i = 0; i < n; ++i) { + #pragma omp unroll full + for (int j = 0; j < 128; ++j) {} + } +} + + +template <typename T, int Factor> +void templated_func(int n) { + // expected-error@+1 {{argument to 'partial' clause must be a strictly positive integer value}} + #pragma omp unroll partial(Factor) + for (T i = 0; i < n; ++i) {} + + // expected-error@+2 {{loop to be fully unrolled must have a constant trip count}} expected-note@+1 {{'#pragma omp unroll full' directive found here}} + #pragma omp unroll full + for (int i = 0; i < n; i-=Factor) {} +} + +void template_inst(int n) { + // expected-note@+1 {{in instantiation of function template specialization 'templated_func<int, -1>' requested here}} + templated_func<int, -1>(n); +} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2045,6 +2045,7 @@ void VisitOMPParallelDirective(const OMPParallelDirective *D); void VisitOMPSimdDirective(const OMPSimdDirective *D); void VisitOMPTileDirective(const OMPTileDirective *D); + void VisitOMPUnrollDirective(const OMPUnrollDirective *D); void VisitOMPForDirective(const OMPForDirective *D); void VisitOMPForSimdDirective(const OMPForSimdDirective *D); void VisitOMPSectionsDirective(const OMPSectionsDirective *D); @@ -2223,6 +2224,23 @@ Visitor->AddStmt(E); } + + + + +void OMPClauseEnqueue::VisitOMPFullClause(const OMPFullClause *C) {} + + +void OMPClauseEnqueue::VisitOMPPartialClause(const OMPPartialClause *C) { + Visitor->AddStmt(C->getFactor()); +} + + + + + + + void OMPClauseEnqueue::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { Visitor->AddStmt(C->getAllocator()); } @@ -2896,6 +2914,12 @@ VisitOMPLoopBasedDirective(D); } + +void EnqueueVisitor::VisitOMPUnrollDirective(const OMPUnrollDirective *D) { + VisitOMPLoopBasedDirective(D); +} + + void EnqueueVisitor::VisitOMPForDirective(const OMPForDirective *D) { VisitOMPLoopDirective(D); } @@ 
-5574,6 +5598,8 @@ return cxstring::createRef("OMPSimdDirective"); case CXCursor_OMPTileDirective: return cxstring::createRef("OMPTileDirective"); + case CXCursor_OMPUnrollDirective: + return cxstring::createRef("OMPUnrollDirective"); case CXCursor_OMPForDirective: return cxstring::createRef("OMPForDirective"); case CXCursor_OMPForSimdDirective: diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -651,6 +651,9 @@ case Stmt::OMPTileDirectiveClass: K = CXCursor_OMPTileDirective; break; + case Stmt::OMPUnrollDirectiveClass: + K = CXCursor_OMPUnrollDirective; + break; case Stmt::OMPForDirectiveClass: K = CXCursor_OMPForDirective; break; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -68,6 +68,8 @@ let flangClass = "OmpObjectList"; } def OMPC_Sizes: Clause<"sizes"> { let clangClass = "OMPSizesClause"; } +def OMPC_Full: Clause<"full"> { let clangClass = "OMPFullClause"; } +def OMPC_Partial: Clause<"partial"> { let clangClass = "OMPPartialClause"; } def OMPC_FirstPrivate : Clause<"firstprivate"> { let clangClass = "OMPFirstprivateClause"; let flangClass = "OmpObjectList"; @@ -400,6 +402,12 @@ VersionedClause<OMPC_Sizes, 51>, ]; } +def OMP_Unroll : Directive<"unroll"> { + let allowedOnceClauses = [ + VersionedClause<OMPC_Full, 51>, + VersionedClause<OMPC_Partial, 51>, + ]; +} def OMP_For : Directive<"for"> { let allowedClauses = [ VersionedClause<OMPC_Private>,