Index: include/clang/AST/StmtOpenMP.h =================================================================== --- include/clang/AST/StmtOpenMP.h +++ include/clang/AST/StmtOpenMP.h @@ -314,6 +314,9 @@ friend class ASTStmtReader; /// \brief Number of collapsed loops as specified by 'collapse' clause. unsigned CollapsedNum; + /// \brief DefaultScheduleKind - Schedule type to use for a given target + /// if no 'schedule' clause or a 'schedule' type 'auto' is specified. + OpenMPDefaultScheduleKind DefaultScheduleKind; /// \brief Offsets to the stored exprs. /// This enumeration contains offsets to all the pointers to children @@ -412,7 +415,7 @@ : OMPExecutableDirective(That, SC, Kind, StartLoc, EndLoc, NumClauses, numLoopChildren(CollapsedNum, Kind) + NumSpecialChildren), - CollapsedNum(CollapsedNum) {} + CollapsedNum(CollapsedNum), DefaultScheduleKind(OMPDSK_unknown) {} /// \brief Offset to the start of children expression arrays. static unsigned getArraysOffset(OpenMPDirectiveKind Kind) { @@ -521,6 +524,9 @@ "expected worksharing loop directive"); *std::next(child_begin(), PrevUpperBoundVariableOffset) = PrevUB; } + void setDefaultSchedule(OpenMPDefaultScheduleKind SK) { + DefaultScheduleKind = SK; + } void setCounters(ArrayRef A); void setPrivateCounters(ArrayRef A); void setInits(ArrayRef A); @@ -567,6 +573,9 @@ /// \brief PreviousUpperBound - local variable passed to runtime in the /// enclosing schedule or null if that does not apply. Expr *PrevUB; + /// \brief DefaultScheduleKind - Schedule type to use for the given target + /// if no 'schedule' clause or a 'schedule' type 'auto' is specified. + OpenMPDefaultScheduleKind DefaultScheduleKind; /// \brief Counters Loop counters. SmallVector Counters; /// \brief PrivateCounters Loop counters. 
@@ -608,6 +617,7 @@ NumIterations = nullptr; PrevLB = nullptr; PrevUB = nullptr; + DefaultScheduleKind = OMPDSK_unknown; Counters.resize(Size); PrivateCounters.resize(Size); Inits.resize(Size); @@ -739,6 +749,9 @@ return const_cast(reinterpret_cast( *std::next(child_begin(), PrevUpperBoundVariableOffset))); } + OpenMPDefaultScheduleKind getDefaultSchedule() const { + return DefaultScheduleKind; + } const Stmt *getBody() const { // This relies on the loop form is already checked by Sema. Stmt *Body = getAssociatedStmt()->IgnoreContainers(true); Index: include/clang/Basic/OpenMPKinds.h =================================================================== --- include/clang/Basic/OpenMPKinds.h +++ include/clang/Basic/OpenMPKinds.h @@ -127,6 +127,9 @@ OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown; }; +/// Default schedule type for any loop-based (#for) OpenMP directive. +enum OpenMPDefaultScheduleKind { OMPDSK_static_chunkone, OMPDSK_unknown }; + OpenMPDirectiveKind getOpenMPDirectiveKind(llvm::StringRef Str); const char *getOpenMPDirectiveName(OpenMPDirectiveKind Kind); Index: lib/AST/StmtOpenMP.cpp =================================================================== --- lib/AST/StmtOpenMP.cpp +++ lib/AST/StmtOpenMP.cpp @@ -105,6 +105,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -156,6 +157,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + Dir->setDefaultSchedule(Exprs.DefaultScheduleKind); return Dir; } @@ -207,6 +209,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -373,6 +376,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + // TODO: Set default schedule. 
return Dir; } @@ -423,6 +427,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -760,6 +765,7 @@ Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); Dir->setHasCancel(HasCancel); + // TODO: Set default schedule. return Dir; } @@ -1003,6 +1009,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1077,6 +1084,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1133,6 +1141,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1188,6 +1197,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1242,6 +1252,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1336,6 +1347,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1389,6 +1401,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1444,6 +1457,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1502,6 +1516,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1582,6 +1597,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. 
return Dir; } @@ -1640,6 +1656,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1701,6 +1718,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } @@ -1759,6 +1777,7 @@ Dir->setUpdates(Exprs.Updates); Dir->setFinals(Exprs.Finals); Dir->setPreInits(Exprs.PreInits); + // TODO: Set default schedule. return Dir; } Index: lib/CodeGen/CGStmtOpenMP.cpp =================================================================== --- lib/CodeGen/CGStmtOpenMP.cpp +++ lib/CodeGen/CGStmtOpenMP.cpp @@ -2187,13 +2187,35 @@ } const unsigned IVSize = getContext().getTypeSize(IVExpr->getType()); const bool IVSigned = IVExpr->getType()->hasSignedIntegerRepresentation(); + // For NVPTX and other GPU targets high performance is often achieved + // if adjacent threads access memory in a coalesced manner. This is + // true for loops that access memory with stride one if a static + // schedule with chunk size of 1 is used. We generate such code + // whenever the OpenMP standard gives us freedom to do so. + // + // This case is called if there is no schedule clause, with a + // schedule(auto), or with a schedule(static,1). + // + // Codegen is optimized for this case. Since chunk size is 1 we do not + // need to generate the inner loop, i.e., the chunk iterator can be + // removed. + // while(idx < GlobalUB) { + // BODY; + // idx += ST; + // } + if (S.getDefaultSchedule() == OMPDSK_static_chunkone) { + ScheduleKind.Schedule = OMPC_SCHEDULE_static; + if (!Chunk) // Force use of chunk=1 + Chunk = Builder.getIntN(IVSize, 1); + } // OpenMP 4.5, 2.7.1 Loop Construct, Description. // If the static schedule kind is specified or if the ordered clause is // specified, and if no monotonic modifier is specified, the effect will // be as if the monotonic modifier was specified. 
- if (RT.isStaticNonchunked(ScheduleKind.Schedule, - /* Chunked */ Chunk != nullptr) && - !Ordered) { + if (S.getDefaultSchedule() == OMPDSK_static_chunkone || + (RT.isStaticNonchunked(ScheduleKind.Schedule, + /* Chunked */ Chunk != nullptr) && + !Ordered)) { if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); // OpenMP [2.7.1, Loop Construct, Description, table 2-1] @@ -2201,17 +2223,21 @@ // chunks that are approximately equal in size, and at most one chunk is // distributed to each thread. Note that the size of the chunks is // unspecified in this case. - RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, - IVSize, IVSigned, Ordered, - IL.getAddress(), LB.getAddress(), - UB.getAddress(), ST.getAddress()); + RT.emitForStaticInit(*this, S.getLocStart(), ScheduleKind, IVSize, + IVSigned, Ordered, IL.getAddress(), + LB.getAddress(), UB.getAddress(), ST.getAddress(), + Chunk); auto LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); - // UB = min(UB, GlobalUB); - EmitIgnoredExpr(S.getEnsureUpperBound()); + if (S.getDefaultSchedule() != OMPDSK_static_chunkone) { + // UB = min(UB, GlobalUB); + EmitIgnoredExpr(S.getEnsureUpperBound()); + } // IV = LB; EmitIgnoredExpr(S.getInit()); // while (idx <= UB) { BODY; ++idx; } + // if OMPDSK_static_chunkone: + // while (idx <= GlobalUB) { BODY; idx += ST; } EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), [&S, LoopExit](CodeGenFunction &CGF) { Index: lib/Sema/SemaOpenMP.cpp =================================================================== --- lib/Sema/SemaOpenMP.cpp +++ lib/Sema/SemaOpenMP.cpp @@ -3855,15 +3855,71 @@ return PostUpdate; } +/// Get the default schedule type for any loop-based OpenMP directive, +/// specialized for a particular target. This is used to guide codegen +/// if a) no 'schedule' clause is specified, or b) a 'schedule' type of +/// 'auto' is specified by the user. 
+static OpenMPDefaultScheduleKind +getDefaultSchedule(Sema &S, OpenMPDirectiveKind Kind, + ArrayRef Clauses) { + OpenMPDefaultScheduleKind DefaultSchedule = OMPDSK_unknown; + + if (S.getLangOpts().OpenMPIsDevice && + S.Context.getTargetInfo().getTriple().isNVPTX()) { + // Force a schedule type of (static,1) if there is no schedule clause, or + // the user specifies schedule(auto) or schedule(static,1). + bool ChunkSizeOne = false; + auto ScheduleKind = OMPC_SCHEDULE_unknown; + auto ScheduleClause = + OMPExecutableDirective::getClausesOfKind(Clauses); + if (ScheduleClause.begin() != ScheduleClause.end()) { + ScheduleKind = (*ScheduleClause.begin())->getScheduleKind(); + if (const auto *Ch = (*ScheduleClause.begin())->getChunkSize()) { + if (!Ch->isValueDependent() && !Ch->isTypeDependent() && + !Ch->isInstantiationDependent() && + !Ch->containsUnexpandedParameterPack()) { + SourceLocation ChLoc = Ch->getLocStart(); + ExprResult Val = S.PerformOpenMPImplicitIntegerConversion( + ChLoc, const_cast(Ch)); + if (!Val.isInvalid()) { + Expr *ValExpr = Val.get(); + llvm::APSInt Result; + ChunkSizeOne = ValExpr->isIntegerConstantExpr(Result, S.Context) && + Result == 1; + } + } + } + } + + // Ordered clause requires dynamic dispatch. + auto OrderedClause = + OMPExecutableDirective::getClausesOfKind(Clauses); + bool Ordered = OrderedClause.begin() != OrderedClause.end(); + + bool StaticOneSchedule = + (!Ordered && (ScheduleKind == OMPC_SCHEDULE_unknown || + ScheduleKind == OMPC_SCHEDULE_auto || + (ScheduleKind == OMPC_SCHEDULE_static && ChunkSizeOne))); + + if (StaticOneSchedule) + DefaultSchedule = OMPDSK_static_chunkone; + } + + return DefaultSchedule; +} + /// \brief Called on a for stmt to check itself and nested loops (if any). /// \return Returns 0 if one of the collapsed stmts is not canonical for loop, /// number of collapsed loops otherwise. 
static unsigned -CheckOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, - Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef, - DSAStackTy &DSA, +CheckOpenMPLoop(OpenMPDirectiveKind DKind, ArrayRef Clauses, + Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr, + Stmt *AStmt, Sema &SemaRef, DSAStackTy &DSA, llvm::DenseMap &VarsWithImplicitDSA, OMPLoopDirective::HelperExprs &Built) { + OpenMPDefaultScheduleKind DefaultScheduleKind = + getDefaultSchedule(SemaRef, DKind, Clauses); + unsigned NestedLoopCount = 1; if (CollapseLoopCountExpr) { // Found 'collapse' clause - calculate collapse number. @@ -4136,17 +4192,20 @@ // Loop condition (IV < NumIterations) or (IV <= UB) for worksharing loops. SourceLocation CondLoc; ExprResult Cond = - (isOpenMPWorksharingDirective(DKind) || - isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind)) + (DefaultScheduleKind != OMPDSK_static_chunkone && + (isOpenMPWorksharingDirective(DKind) || + isOpenMPTaskLoopDirective(DKind) || isOpenMPDistributeDirective(DKind))) ? SemaRef.BuildBinOp(CurScope, CondLoc, BO_LE, IV.get(), UB.get()) : SemaRef.BuildBinOp(CurScope, CondLoc, BO_LT, IV.get(), NumIterations.get()); - // Loop increment (IV = IV + 1) + // Loop increment (IV = IV + 1) or (IV = IV + ST) if (static,1) scheduling. SourceLocation IncLoc; ExprResult Inc = - SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), - SemaRef.ActOnIntegerConstant(IncLoc, 1).get()); + DefaultScheduleKind == OMPDSK_static_chunkone + ? 
SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), ST.get()) + : SemaRef.BuildBinOp(CurScope, IncLoc, BO_Add, IV.get(), + SemaRef.ActOnIntegerConstant(IncLoc, 1).get()); if (!Inc.isUsable()) return 0; Inc = SemaRef.BuildBinOp(CurScope, IncLoc, BO_Assign, IV.get(), Inc.get()); @@ -4295,6 +4354,7 @@ Built.NUB = NextUB.get(); Built.PrevLB = PrevLB.get(); Built.PrevUB = PrevUB.get(); + Built.DefaultScheduleKind = DefaultScheduleKind; Expr *CounterVal = SemaRef.DefaultLvalueConversion(IV.get()).get(); // Fill data for doacross depend clauses. @@ -4417,9 +4477,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_simd, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4456,9 +4517,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4493,7 +4555,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_for_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -4694,10 +4756,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_parallel_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -4739,10 +4801,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5697,10 +5759,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
- unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_target_parallel_for, Clauses, getCollapseNumberExpr(Clauses), + getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5905,7 +5967,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_taskloop, Clauses, getCollapseNumberExpr(Clauses), /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -5936,10 +5998,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_taskloop_simd, Clauses, getCollapseNumberExpr(Clauses), + /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -5980,7 +6042,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_distribute, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6013,7 +6075,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses), + OMPD_distribute_parallel_for, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6045,10 +6107,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6081,10 +6144,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_distribute_simd, Clauses, getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6118,7 +6181,7 @@ // In presence of clause 'collapse' or 'ordered' with number of loops, it will // define the nested loops number. 
unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses), + OMPD_target_parallel_for_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6164,7 +6227,7 @@ // In presence of clause 'collapse' with number of loops, it will define the // nested loops number. unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses), + CheckOpenMPLoop(OMPD_target_simd, Clauses, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6210,10 +6273,10 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = - CheckOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + unsigned NestedLoopCount = CheckOpenMPLoop( + OMPD_teams_distribute, Clauses, getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6244,7 +6307,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses), + OMPD_teams_distribute_simd, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); @@ -6291,10 +6354,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_teams_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6339,10 +6403,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - unsigned NestedLoopCount = CheckOpenMPLoop( - OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + unsigned NestedLoopCount = + CheckOpenMPLoop(OMPD_teams_distribute_parallel_for, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6406,8 +6471,7 @@ // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute, - getCollapseNumberExpr(Clauses), + OMPD_target_teams_distribute, Clauses, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) @@ -6439,11 +6503,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_parallel_for, - getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6484,11 +6548,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. - auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_parallel_for_simd, - getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); @@ -6530,10 +6594,11 @@ OMPLoopDirective::HelperExprs B; // In presence of clause 'collapse' with number of loops, it will // define the nested loops number. 
- auto NestedLoopCount = CheckOpenMPLoop( - OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + auto NestedLoopCount = + CheckOpenMPLoop(OMPD_target_teams_distribute_simd, Clauses, + getCollapseNumberExpr(Clauses), + nullptr /*ordered not a clause on distribute*/, AStmt, + *this, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); Index: test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp =================================================================== --- /dev/null +++ test/OpenMP/nvptx_coalesced_scheduling_codegen.cpp @@ -0,0 +1,322 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + 
+// Check that the execution mode of the target regions on the gpu is set to the right mode. +// CHECK-DAG: {{@__omp_offloading_.+l19}}_exec_mode = weak constant i8 0 + +template +tx ftemplate() { + tx a[100]; + tx b[10][10]; + + #pragma omp target parallel + { + #pragma omp for + for (int i = 0; i < 99; i++) { + a[i] = 1; + } + + #pragma omp for schedule(auto) + for (int i = 0; i < 98; i++) { + a[i] = 2; + } + + #pragma omp for schedule(static,1) + for (int i = 0; i < 97; i++) { + a[i] = 3; + } + + #pragma omp for schedule(static,2) + for (int i = 0; i < 96; i++) { + a[i] = 1; + } + + #pragma omp for schedule(static) + for (int i = 0; i < 95; i++) { + a[i] = 1; + } + + #pragma omp for schedule(auto) ordered + for (int i = 0; i < 94; i++) { + a[i] = 1; + } + + #pragma omp for schedule(runtime) + for (int i = 0; i < 93; i++) { + a[i] = 1; + } + + #pragma omp for schedule(dynamic) + for (int i = 0; i < 92; i++) { + a[i] = 1; + } + + #pragma omp for schedule(guided) + for (int i = 0; i < 91; i++) { + a[i] = 1; + } + } + + return a[0] + b[9][9]; +} + +int bar(){ + int a = 0; + + a += ftemplate(); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l19}}( + // CHECK: call void @__kmpc_spmd_kernel_init( + // CHECK: br label {{%?}}[[EXEC:.+]] + // + // CHECK: [[EXEC]] + // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* + // CHECK: br label {{%?}}[[DONE:.+]] + // + // CHECK: [[DONE]] + // CHECK: call void @__kmpc_spmd_kernel_deinit() + // CHECK: br label {{%?}}[[EXIT:.+]] + // + // CHECK: [[EXIT]] + // CHECK: ret void + // CHECK: } + + // CHECK: define internal void [[OP1]]( + + // No schedule clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 98, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 99 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 1, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(auto) clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 97, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 98 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 2, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(static,1) clause. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 96, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // CHECK: br label {{%?}}[[FOR_COND:.+]] + // + // CHECK: [[FOR_COND]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[CMP:%.+]] = icmp slt i32 [[IV]], 97 + // CHECK: br i1 [[CMP]], label {{%?}}[[FOR_BODY:.+]], label {{%?}}[[FOR_END:.+]] + // + // CHECK: [[FOR_BODY]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[MUL:%.+]] = mul nsw i32 [[IV]], 1 + // CHECK: [[ADD:%.+]] = add nsw i32 0, [[MUL]] + // CHECK: store i32 [[ADD]], i32* [[I_PTR:%.+]], align + // CHECK: [[I:%.+]] = load i32, i32* [[I_PTR]], align + // CHECK-32: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i32 0, i32 [[I]] + // CHECK-64: [[IDX:%.+]] = sext i32 [[I]] to i64 + // CHECK-64: [[ELEM_PTR:%.+]] = getelementptr inbounds [100 x i32], [100 x i32]* {{%.+}}, i64 0, i64 [[IDX]] + // CHECK: store i32 3, i32* [[ELEM_PTR]], align + // CHECK: br label {{%?}}[[FOR_CONT:.+]] + // + // CHECK: [[FOR_CONT]] + // CHECK: br label {{%?}}[[FOR_INC:.+]] + // + // CHECK: [[FOR_INC]] + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ST:%.+]] = load i32, i32* [[ST_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], [[ST]] + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + // CHECK: br label {{%?}}[[FOR_COND]] + // + // CHECK: [[FOR_END]] + + + + // schedule(static,2) clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 95, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 33, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 2) + // CHECK: br label {{%?}}[[DISPATCH_COND:.+]] + // + // CHECK: [[DISPATCH_COND]] + // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align + // CHECK: = icmp sgt i32 [[UB]], 95 + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(static) clause. Non-coalesced codegen. + // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 94, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_for_static_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 34, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]], i32 1, i32 1) + // CHECK: [[UB:%.+]] = load i32, i32* [[UB_PTR]], align + // CHECK: = icmp sgt i32 [[UB]], 94 + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(auto) ordered clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 93, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 70 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(runtime) clause. Non-coalesced codegen. + // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 92, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 37 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // schedule(dynamic) clause. Non-coalesced codegen. 
+ // + // CHECK: store i32 0, i32* [[LB_PTR:%.+]], align + // CHECK: store i32 91, i32* [[UB_PTR:%.+]], align + // CHECK: store i32 1, i32* [[ST_PTR:%.+]], align + // CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32 35 + // CHECK: call i32 @__kmpc_dispatch_next_4(%ident_t* {{@.+}}, i32 {{%.+}}, i32* {{%.+}}, i32* [[LB_PTR]], i32* [[UB_PTR]], i32* [[ST_PTR]]) + // + // CHECK: [[LB:%.+]] = load i32, i32* [[LB_PTR]], align + // CHECK: store i32 [[LB]], i32* [[IV_PTR:%.+]], align + // + // CHECK: = getelementptr + // + // CHECK: [[IV:%.+]] = load i32, i32* [[IV_PTR]], align + // CHECK: [[ADD:%.+]] = add nsw i32 [[IV]], 1 + // CHECK: store i32 [[ADD]], i32* [[IV_PTR]], align + + + + // CHECK: ret void + // CHECK: } + +#endif